format:
  html:
    embed-resources: true
# SECURITY NOTE(review): API key is hardcoded and committed to source.
# Rotate this key and load it from an environment variable or a config
# file excluded from version control instead.
API_KEY='396dd8714fbc4b4fa24b537d26e3879e'
import requests
import json
import re
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from datetime import datetime
/Users/chendong/opt/anaconda3/lib/python3.8/site-packages/pandas/core/computation/expressions.py:20: UserWarning: Pandas requires version '2.7.3' or newer of 'numexpr' (version '2.7.1' currently installed). from pandas.core.computation.check import NUMEXPR_INSTALLED
# NewsAPI "everything" search endpoint.
baseURL = "https://newsapi.org/v2/everything?"
# NOTE(review): total_requests is defined but never used — FormURL
# hardcodes 'totalRequests': 1. Confirm whether this was intended.
total_requests=2
verbose=True
# Search topic used throughout this script.
TOPIC='Disney'
def FormURL(TOPIC):
    """Query the NewsAPI 'everything' endpoint for TOPIC.

    Builds the query parameters (API key, '+'-prefixed topic, relevancy
    sort), issues the GET request, and returns the decoded JSON payload.
    """
    query_params = {
        'apiKey': API_KEY,
        'q': '+' + TOPIC,
        'sortBy': 'relevancy',
        'totalRequests': 1,
    }
    # Fetch from the server and decode the JSON body.
    raw = requests.get(baseURL, query_params)
    return raw.json()
# PRETTY PRINT
# https://www.digitalocean.com/community/tutorials/python-pretty-print-json
# print(json.dumps(response, indent=2))
# #GET TIMESTAMP FOR PULL REQUEST
# timestamp = datetime.now().strftime("%Y-%m-%d-H%H-M%M-S%S")
# SAVE TO FILE
# with open(timestamp+'-newapi-raw-data.json', 'w') as outfile:
# json.dump(response, outfile, indent=4)
def string_cleaner(input_string):
    """Normalize free text: strip punctuation, collapse whitespace, lowercase.

    Returns the cleaned string, or '' when the input cannot be processed
    (e.g. None for a missing article field).
    """
    try:
        # Replace runs of punctuation plus any trailing spaces with one space.
        out = re.sub(r"""
            [,.;@#?!&$-]+  # Accept one or more copies of punctuation
            \ *            # plus zero or more copies of a space,
            """,
            " ",  # and replace it with a single space
            input_string, flags=re.VERBOSE)
        # BUG FIX: this used to re-run on input_string, silently discarding
        # the punctuation pass above; it now chains on `out`.
        out = re.sub('[’.]+', '', out)
        # Eliminate duplicate whitespace using wildcards.
        out = re.sub(r'\s+', ' ', out)
        # Convert to lower case.
        out = out.lower()
    except TypeError:
        # Non-string input (e.g. None). Narrowed from a bare except so
        # genuine programming errors are no longer swallowed.
        print("ERROR")
        out = ''
    return out
def CleanJSON(response, TOPIC):
    """Flatten a NewsAPI response into cleaned text collections.

    Collects cleaned titles and descriptions into one flat list, cleaned
    'content' fields into per-article rows, writes the rows to
    <TOPIC>_cleaned_news.csv, and returns (text_description, cleaned_data).
    """
    articles = response['articles']   # one dict per article
    fields = articles[0].keys()       # field order taken from the first article
    cleaned_data = []
    text_description = []
    for article in articles:
        row = []
        for field in fields:
            if field == 'title':
                text_description.append(string_cleaner(article[field]))
            if field == 'description':
                text_description.append(string_cleaner(article[field]))
            if field == 'content':
                row.append(string_cleaner(article[field]))
        cleaned_data.append(row)
    # Persist the per-article content rows for downstream steps.
    frame = pd.DataFrame(cleaned_data)
    frame.to_csv(TOPIC + '_cleaned_news.csv', index=False)
    return text_description, cleaned_data
import wikipedia
# Save the Wikipedia summary for TOPIC as a second text corpus
# alongside the news data.
with open(TOPIC+"_cleaned_wiki.txt", "w") as file:
    file.write(wikipedia.summary(TOPIC))
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
#import matplotlib
# MODIFIED FROM
# https://towardsdatascience.com/simple-wordcloud-in-python-2ae54a9f58e5
def generate_word_cloud(my_text):
    """Render a word cloud for my_text and display it with matplotlib.

    Adapted from
    https://towardsdatascience.com/simple-wordcloud-in-python-2ae54a9f58e5
    """
    def show_cloud(cloud):
        # Large canvas, image only, axes hidden.
        plt.figure(figsize=(40, 30))
        plt.imshow(cloud)
        plt.axis("off")

    # Build the cloud with a fixed seed so output is reproducible.
    cloud = WordCloud(width=3000,
                      height=2000,
                      random_state=1,
                      background_color='salmon',
                      colormap='Pastel1',
                      collocations=False,
                      stopwords=STOPWORDS).generate(my_text)
    show_cloud(cloud)
    plt.show()
#generate_word_cloud(text)
# Word cloud from the cleaned news titles/descriptions for TOPIC.
generate_word_cloud(str(CleanJSON(FormURL(TOPIC), TOPIC)[0]))
# Word cloud from the Wikipedia summary for comparison.
generate_word_cloud(wikipedia.summary(TOPIC))
im a disney adult, and i dont care who knows it i live in disney t-shirts, i was nearly a friend of tinker bell (thats a euphemism for playing a character in the parks), i have been to disney world … [+10007 chars] [['im a disney adult, and i dont care who knows it i live in disney t-shirts, i was nearly a friend of tinker bell (thats a euphemism for playing a character in the parks), i have been to disney world … [+10007 chars]'], ['im a disney adult, and i dont care who knows it i live in disney t-shirts, i was nearly a friend of tinker bell (thats a euphemism for playing a character in the parks), i have been to disney world … [+10007 chars]'], ['im a disney adult, and i dont care who knows it i live in disney t-shirts, i was nearly a friend of tinker bell (thats a euphemism for playing a character in the parks), i have been to disney world … [+10007 chars]'], ['im a disney adult, and i dont care who knows it i live in disney t-shirts, i was nearly a friend of tinker bell (thats a euphemism for playing a character in the parks), i have been to disney world … [+10007 chars]'], ['im a disney adult, and i dont care who knows it i live in disney t-shirts, i was nearly a friend of tinker bell (thats a euphemism for playing a character in the parks), i have been to disney world … [+10007 chars]'], ['im a disney adult, and i dont care who knows it i live in disney t-shirts, i was nearly a friend of tinker bell (thats a euphemism for playing a character in the parks), i have been to disney world … [+10007 chars]'], ['im a disney adult, and i dont care who knows it i live in disney t-shirts, i was nearly a friend of tinker bell (thats a euphemism for playing a character in the parks), i have been to disney world … [+10007 chars]'], ['im a disney adult, and i dont care who knows it i live in disney t-shirts, i was nearly a friend of tinker bell (thats a euphemism for playing a character in the parks), i have been to disney world … [+10007 chars]']]
import pandas as pd
import numpy as np
# read data
df = pd.read_csv("../data/raw-data/news_r.csv")
## REMOVE SPACES FROM COLUMN NAMES
df.rename(columns=lambda x: x.strip(), inplace=True)
df.rename(columns={"publishedAt": 'date'}, inplace=True)
# Drop the saved index column produced by an earlier to_csv step.
df = df.drop(columns=["Unnamed: 0"])
#CONVERT TYPECAST
df["date"] = pd.to_datetime(df["date"])
df = df.drop_duplicates()
# Mark rows whose text contains raw URLs as NaN so dropna removes them.
# NOTE(review): these lambdas raise TypeError if 'content'/'description'
# already contain NaN — confirm the raw file has no missing text.
df['content'] = df['content'].apply(lambda x : np.nan if "http" in x else x)
df['description'] = df['description'].apply(lambda x : np.nan if "http" in x else x)
# NewsAPI marks deleted articles with the literal string '[Removed]'.
df = df.replace('[Removed]', np.nan)
df = df.dropna(axis = 0)
# Strip markup fragments and special characters from the text columns.
# Order matters: the '[Removed]' sentinel rows must be gone before '['
# and ']' are stripped, and '…' is removed independently of '...'.
for char in ['[', ']', '&', '$', '<ul>', '<li>', '</li>', '+', 'chars', 'amp', '#', '…', '®', '{', '}', '...', '%']:
    df['content'] = df['content'].apply(lambda x : x.replace(char, ''))
    df['description'] = df['description'].apply(lambda x : x.replace(char, ''))
    df['title'] = df['title'].apply(lambda x : x.replace(char, ''))
import re
# Remove all digits from the text columns.
df['content'] = df['content'].apply(lambda x: re.sub(r'[0-9]', '', x))
df['description'] = df['description'].apply(lambda x: re.sub(r'[0-9]', '', x))
df['title'] = df['title'].apply(lambda x: re.sub(r'[0-9]', '', x))
df.to_csv("../data/modified-data/clean_text.csv")
from sklearn.feature_extraction.text import CountVectorizer
# Build one corpus from the cleaned titles and descriptions.
corpus = df['title'].tolist()
corpus.extend(df['description'].tolist())
# Learn the bag-of-words vocabulary over the whole corpus.
vectorizer = CountVectorizer()
vectorizer.fit(corpus)
print("Vocabulary: ", vectorizer.vocabulary_)
import csv
# Persist the word -> index mapping. BUG FIX: csv.writer requires the
# file opened with newline='' (per the csv module docs); without it each
# row is followed by a blank line on Windows.
with open('../data/clean-data/vocabulary.csv', 'w', newline='') as csv_file:
    writer = csv.writer(csv_file)
    for key, value in vectorizer.vocabulary_.items():
        writer.writerow([key, value])
# Encode every document as a count vector over the learned vocabulary.
vector = vectorizer.transform(corpus)
# Summarizing the Encoded Texts
print("Encoded Document is:\n", vector.toarray())
pd.DataFrame(vector.toarray()).to_csv("../data/modified-data/encode.csv")
Vocabulary: {'sunny': 1112, 'or': 796, 'akshay': 37, 'who': 1298, 'will': 1303, 'you': 1335, 'watch': 1275, 'loki': 656, 'season': 997, 'temporal': 1143, 'loom': 663, 'explained': 387, 'what': 1290, 'is': 586, 'the': 1151, 'tva': 1212, 'machine': 670, 'that': 1150, 'created': 262, 'mcu': 695, 'sacred': 979, 'timeline': 1177, 'stack': 1071, 'spotify': 1070, 'to': 1182, 'launch': 623, 'video': 1247, 'ads': 23, 'on': 788, 'roku': 967, 'meta': 708, 'considers': 243, 'ad': 17, 'free': 448, 'tier': 1171, 'in': 563, 'eu': 370, 'my': 743, 'disney': 315, 'top': 1191, 'mickey': 711, 'not': 770, 'so': 1046, 'scary': 987, 'halloween': 504, 'party': 826, 'today': 1183, 'wordle': 1314, 'hints': 530, 'clues': 220, 'and': 58, 'answer': 63, 'for': 441, 'friday': 450, 'october': 776, 'th': 1147, 'it': 589, 'pivotal': 842, 'year': 1330, 'labor': 615, 'strikes': 1093, 'charts': 207, 'tell': 1141, 'story': 1084, 'americans': 52, 'are': 73, 'unleashing': 1222, 'monsters': 727, 'they': 1157, 'have': 516, 'no': 767, 'idea': 554, 'how': 548, 'contain': 245, 'make': 677, 'your': 1337, 'home': 540, 'merry': 707, 'with': 1308, 'holiday': 537, 'decor': 290, 'netflix': 751, 'india': 570, 'subscribers': 1099, 'dwarfed': 335, 'by': 171, 'prime': 874, 'bernstein': 123, 'says': 985, 'mission': 721, 'raniganj': 903, 'box': 143, 'office': 782, 'collection': 225, 'can': 178, 'kumar': 613, 'deliver': 294, 'back': 95, 'superhits': 1115, 'day': 278, 'doesn': 320, 'look': 660, 'promising': 884, 'shares': 1023, 'rebound': 917, 'amid': 53, 'broader': 157, 'market': 684, 'downturn': 328, 'lego': 636, 'pixar': 843, 'up': 1230, 'house': 547, 'building': 163, 'set': 1016, 'only': 791, 'why': 1300, 'wasn': 1274, 'ob': 775, 'memory': 703, 'wiped': 1307, 'he': 518, 'remains': 930, 'epic': 362, 'games': 463, 'fortnite': 444, 'longer': 659, 'bringing': 153, 'bacon': 97, 'addition': 18, 'layoffs': 628, 'we': 1282, 're': 912, 'going': 483, 'charge': 204, 'tv': 1211, 'film': 421, 'producers': 878, 'like': 643, 'seat': 
999, 'unreal': 1224, 'engine': 358, 'users': 1235, 'followup': 438, 'world': 1319, 'cup': 271, 'shubman': 1034, 'gill': 474, 'reportedly': 940, 'down': 327, 'dengue': 296, 'might': 714, 'miss': 720, 'opener': 793, 'match': 689, 'against': 29, 'australia': 85, 'where': 1293, 'find': 427, 'birmingham': 132, 'city': 214, 'vs': 1261, 'west': 1289, 'brom': 159, 'us': 1234, 'nct': 747, 'became': 110, 'pop': 856, 'unapologetic': 1218, 'mavericks': 691, 'marvel': 687, 'sets': 1017, 'tom': 1186, 'hiddleston': 525, 'replacement': 938, 'post': 860, 'credits': 264, 'scene': 988, 'legal': 634, 'drama': 329, 'suits': 1109, 'which': 1295, 'streams': 1091, 'peacock': 830, 'hits': 534, 'weeks': 1285, 'at': 78, 'nielsen': 761, 'streaming': 1090, 'rankings': 906, 'setting': 1018, 'record': 922, 'most': 732, 'rick': 958, 'porter': 859, 'hollywood': 539, 'reporter': 941, 'all': 41, 'ahsoka': 34, 'episodes': 364, 'ranked': 904, 'from': 452, 'worst': 1322, 'wizard': 1310, 'ein': 344, 'li': 640, 'eretz': 367, 'acheret': 7, 'other': 799, 'land': 617, 'shell': 1028, 'malaysia': 679, 'launches': 625, 'star': 1073, 'wars': 1272, 'racers': 898, 'remote': 932, 'control': 250, 'cars': 186, 'designs': 298, 'rm': 965, 'eastpak': 340, 'six': 1039, 'years': 1332, 'after': 27, 'metoo': 710, 'entertainment': 359, 'employees': 352, 'believe': 119, 'culture': 270, 'of': 779, 'abuse': 2, 'misconduct': 719, 'has': 512, 'improved': 562, 'survey': 1125, 'finds': 428, 'episode': 363, 'ending': 355, 'premiere': 865, 'mobius': 723, 'mostly': 733, 'excellent': 376, 'adventure': 24, 'end': 354, 'time': 1175, 'slipping': 1042, 'god': 481, 'mischief': 718, 'keeps': 604, 'glitching': 477, 'teases': 1138, 'kang': 603, 'secret': 1001, 'connection': 241, 'spoiler': 1064, 'ep': 360, 'about': 1, 'straddling': 1085, 'line': 644, 'between': 126, 'intrigue': 582, 'confusion': 240, 'stay': 1077, 'tuned': 1207, 'because': 111, 'biggest': 130, 'questions': 895, 'review': 956, 'slip': 1041, 'slide': 1040, 'must': 742, 'see': 
1004, 'ties': 1172, 'thor': 1162, 'comic': 228, 'books': 140, 'spoilers': 1065, 'here': 523, 'resolves': 947, 'huge': 549, 'cliffhanger': 219, 'round': 973, 'goes': 482, 'behind': 117, 'attraction': 81, 'again': 28, 'does': 319, 'character': 202, 'gives': 476, 'emergency': 350, 'signal': 1036, 'as': 77, 'brawl': 146, 'breaks': 151, 'out': 803, 'among': 54, 'guests': 499, 'once': 789, 'upon': 1233, 'studio': 1096, 'stills': 1082, 'feature': 404, 'classic': 217, 'characters': 203, 'live': 651, 'action': 9, 'surroundings': 1124, 'disneyland': 316, 'banned': 100, 'viral': 1256, 'tiktok': 1173, 'trend': 1203, 'still': 1081, 'features': 406, 'jonathan': 597, 'majors': 676, 'conqueror': 242, 'victor': 1246, 'timely': 1178, 'elderly': 345, 'man': 680, 'found': 445, 'collapsed': 224, 'bushes': 166, 'lebron': 631, 'james': 592, 'lakers': 616, 'preseason': 868, 'but': 168, 'healthy': 520, 'murders': 741, 'finale': 425, 'was': 1273, 'big': 129, 'hulu': 551, 'sony': 1054, 'pictures': 838, 'core': 252, 'ps': 891, 'adds': 20, 'new': 755, 'plus': 851, 'benefits': 122, 'early': 339, 'movie': 734, 'access': 4, 'another': 62, 'becoming': 112, 'streamed': 1088, 'title': 1180, 'times': 1179, 'mary': 688, 'poppins': 857, 'legend': 635, 'celebrates': 191, 'birthday': 133, 'crowds': 268, 'suffer': 1105, 'dangers': 275, 'florida': 433, 'virgin': 1257, 'river': 964, 'final': 424, 'batch': 102, 'promos': 886, 'minutes': 717, 'greatest': 493, 'vinicius': 1254, 'front': 453, 'page': 817, 'montage': 728, 'pinochius': 840, 'shocks': 1029, 'spanish': 1061, 'public': 893, 'real': 915, 'madrid': 672, 'take': 1130, 'pet': 835, 'sematary': 1009, 'bloodlines': 136, 'dick': 304, 'butkus': 169, 'nfl': 759, 'prolific': 883, 'dead': 280, 'fearsome': 403, 'hall': 503, 'fame': 396, 'chicago': 209, 'bears': 109, 'linebacker': 646, 'dies': 308, 'long': 658, 'awaited': 91, 'open': 792, 'ahead': 33, 'schedule': 990, 'apple': 72, 'ghosts': 471, 'thriller': 1166, 'cocaine': 222, 'bear': 108, 'darkness': 276, 
'within': 1309, 'la': 614, 'luz': 668, 'del': 292, 'mundo': 740, 'when': 1292, 'golden': 484, 'bachelor': 94, 'burning': 165, 'abc': 0, 'boss': 142, 'casting': 189, 'hometowns': 541, 'fantasy': 401, 'suites': 1108, 'america': 51, 'be': 105, 'talking': 1133, 'taylor': 1135, 'swift': 1126, 'donates': 325, 'vip': 1255, 'eras': 366, 'tour': 1195, 'tickets': 1170, 'selena': 1008, 'gomez': 485, 'charity': 205, 'fund': 458, 'work': 1316, 'this': 1161, 'london': 657, 'event': 372, 'elemental': 346, 'more': 731, 'popular': 858, 'than': 1148, 'little': 650, 'mermaid': 706, 'guardians': 497, 'galaxy': 462, 'overall': 807, 'claims': 216, 'spot': 1069, 'week': 1283, 'tops': 1192, 'originals': 798, 'franchise': 447, 'coming': 230, 'linear': 645, 'networks': 753, 'movies': 735, 'soon': 1055, 'channels': 201, 'brooke': 160, 'an': 56, 'update': 1232, 'walt': 1265, 'tradition': 1198, 'animal': 59, 'crossing': 266, 'comes': 227, 'price': 873, 'hikes': 528, 'news': 757, 'odds': 777, 'ends': 357, 'park': 822, 'sued': 1104, 'woman': 1311, 'gynecologic': 501, 'injuries': 572, 'community': 233, 'rallies': 902, 'around': 74, 'contentious': 248, 'lawsuit': 627, 'schrodinger': 992, 'snow': 1045, 'white': 1297, 'actress': 14, 'rachel': 899, 'ziegler': 1339, 'creates': 263, 'controversy': 251, 'interview': 579, 'comments': 231, 'studios': 1097, 'scrapping': 993, 'everything': 375, 'do': 317, 'blade': 135, 'save': 984, 'universe': 1221, 'least': 630, 'his': 531, 'own': 811, 'show': 1032, 'turner': 1210, 'nets': 752, 'now': 773, 'share': 1021, 'rights': 962, 'air': 35, 'films': 422, 'brings': 154, 'stars': 1075, 'festival': 413, 'held': 521, 'captive': 181, 'unsafe': 1225, 'conditions': 237, 'spooktacular': 1067, 'love': 665, 'letter': 639, 'horror': 543, 'icon': 552, 'ranking': 905, 'wheel': 1291, 'lioness': 647, 'rise': 963, 'hotel': 546, 'guest': 498, 'suffers': 1106, 'attack': 79, 'huis': 550, 'ten': 1144, 'bosch': 141, 'wants': 1267, 'decision': 289, 'casino': 187, 'project': 881, 
'breaking': 150, 'moana': 722, 'epcot': 361, 'zegler': 1338, 'head': 519, 'shared': 1022, 'under': 1219, 'deal': 282, 'ed': 341, 'sheeran': 1027, 'explains': 388, 'had': 502, 'grave': 491, 'dug': 334, 'property': 888, 'frozen': 454, 'stems': 1079, 'incredible': 567, 'director': 312, 'hilton': 529, 'families': 398, 'one': 790, 'ott': 800, 'sukanya': 1110, 'verma': 1241, 'her': 522, 'recommendations': 921, 'introduced': 583, 'incredibly': 568, 'important': 560, 'piece': 839, 'technology': 1139, 'prepares': 867, 'through': 1167, 'contemplates': 247, 'offering': 781, 'social': 1047, 'platforms': 847, 'bring': 152, 'app': 68, 'announced': 60, 'roll': 969, 'chris': 213, 'highlights': 527, 'spookiest': 1066, 'moments': 725, 'magic': 673, 'kingdom': 609, 'yearly': 1331, 'celebration': 193, 'solve': 1050, 'daily': 273, 'also': 47, 'play': 848, 'competitive': 235, 'learn': 629, 'each': 337, 'word': 1313, 'makes': 678, 'particularly': 825, 'different': 309, 'walkouts': 1264, 'private': 875, 'sector': 1002, 'workers': 1317, 'present': 869, 'difficult': 310, 'politics': 853, 'get': 469, 'sorted': 1057, 'way': 1280, 'avoids': 90, 'calamity': 174, 'biden': 128, 'administration': 21, 'proxy': 890, 'war': 1268, 'russia': 978, 'ukraine': 1217, 'appears': 71, 'been': 113, 'lost': 664, 'battlefield': 104, 'midst': 713, 'meth': 709, 'cough': 255, 'syrup': 1127, 'pagea': 818, 'shopdisney': 1030, 'dropped': 332, 'ton': 1187, 'merchandise': 705, 'couldn': 257, 'excited': 377, 'eyeing': 391, 'available': 89, 'wait': 1262, 'decorate': 291, 'let': 638, 'options': 795, 'think': 1159, 'wreaths': 1325, 'succeeded': 1102, 'scaling': 986, 'its': 591, 'business': 167, 'despite': 299, 'global': 478, 'giant': 472, 'consistently': 244, 'lowering': 666, 'subscription': 1100, 'costs': 253, 'country': 259, 'analysts': 57, 'alliancebernstein': 44, 'wrote': 1328, 'report': 939, 'clients': 218, 'thursday': 1168, 'streamer': 1089, 'predicted': 862, 'poor': 855, 'opening': 794, 'performance': 833, 'grab': 
488, 'cute': 272, 'great': 492, 'over': 806, 'amazon': 50, 'additional': 19, 'details': 301, 'include': 565, 'give': 475, 'fan': 400, 'any': 64, 'kid': 607, 'lives': 652, 'high': 526, 'flying': 434, 'adventures': 25, 'gift': 473, 'full': 456, 'confirmed': 239, 'agents': 31, 'variance': 1238, 'authority': 87, 'got': 486, 'their': 1152, 'memories': 702, 'ouroboros': 802, 'played': 849, 'some': 1051, 'spectacular': 1062, 'innings': 573, 'odi': 778, 'cricket': 265, 'recent': 919, 'fever': 415, 'tested': 1146, 'before': 114, 'call': 175, 'taken': 1131, 'matter': 690, 'television': 1140, 'via': 1245, 'efl': 343, 'chionship': 212, 'pm': 852, 'et': 369, 'pt': 892, 'espn': 368, 'stream': 1087, 'nine': 765, 'member': 700, 'group': 496, 'always': 48, 'genre': 466, 'innovative': 574, 'bands': 98, 'album': 38, 'fact': 393, 'check': 208, 'doubles': 326, 'gloss': 479, 'packed': 815, 'sound': 1058, 'warning': 1271, 'smash': 1044, 'hit': 533, 'series': 1012, 'officially': 784, 'better': 125, 'ever': 373, 'looks': 662, 'titular': 1181, 'isn': 588, 'burdened': 164, 'debut': 286, 'mer': 704, 'dave': 277, 'filoni': 423, 'come': 226, 'reflect': 924, 'could': 256, 'very': 1244, 'well': 1288, 'best': 124, 'projects': 882, 'since': 1037, 'george': 468, 'lucas': 667, 'era': 365, 'sermon': 1013, 'gave': 464, 'kol': 611, 'nidrei': 760, 'sometimes': 1052, 'song': 1053, 'takes': 1132, 'residence': 945, 'brain': 144, 'happens': 508, 'ear': 338, 'worm': 1320, 'me': 696, 'car': 183, 'bit': 134, 'unusual': 1227, 'looking': 661, 'rc': 911, 'sale': 981, 'till': 1174, 'november': 772, 'made': 671, 'racer': 897, 'first': 431, 'iconic': 553, 'adorning': 22, 'emblematic': 349, 'models': 724, 'appeared': 70, 'fucking': 455, 'young': 1336, 'hashtag': 513, 'took': 1190, 'off': 780, 'bombshell': 139, 'exposés': 389, 'harvey': 511, 'weinstein': 1286, 'number': 774, 'industry': 571, 'feel': 408, 'progress': 880, 'wif': 1302, 'nonprofit': 769, 'organizatio': 797, 'ground': 495, 'running': 976, 'culminates': 
269, 'twist': 1215, 'filled': 420, 'tone': 1188, 'second': 1000, 'outing': 805, 'stinger': 1083, 'means': 697, 'listen': 649, 'laufeyson': 622, 'unstuck': 1226, 'shows': 1033, 'finally': 426, 'returned': 952, 'importantly': 561, 'did': 305, 'chicken': 210, 'mcnuggets': 694, 'really': 916, 'exist': 382, 'unlikely': 1223, 'hero': 524, 'having': 517, 'rough': 972, 'few': 416, 'stage': 1072, 'exciting': 378, 'drops': 333, 'history': 532, 'if': 555, 'two': 1216, 'felt': 412, 'dream': 330, 'don': 323, 'worry': 1321, 'supposed': 1117, 'happening': 507, 'according': 6, 'executive': 381, 'producer': 877, 'kevin': 605, 'wright': 1326, 'crafting': 261, 'returns': 953, 'owen': 809, 'wilson': 1304, 'sophia': 1056, 'di': 303, 'martino': 686, 'surprise': 1121, 'broxton': 162, 'oklahoma': 785, 'comics': 229, 'raises': 901, 'interesting': 577, 'break': 147, 'know': 610, 'ravonna': 910, 'explain': 386, 'too': 1189, 'much': 736, 'audience': 84, 'feels': 409, 'carry': 185, 'tells': 1142, 'thewrap': 1156, 'contains': 246, 'try': 1206, 'ways': 1281, 'solo': 1049, 'month': 729, 'far': 402, 'premie': 864, 'yes': 1333, 'indeed': 569, 'reunites': 954, 'familiar': 397, 'face': 392, 'past': 827, 'jump': 600, 'resolve': 946, 'breakdown': 148, 'future': 460, 'robert': 966, 'niles': 764, 'ready': 914, 'scenes': 989, 'theme': 1154, 'attractions': 82, 'drop': 331, 'featured': 405, 'jung': 601, 'follow': 436, 'footsteps': 440, 'many': 682, 'beware': 127, 'performer': 834, 'alerted': 40, 'management': 681, 'fight': 418, 'broke': 158, 'parade': 820, 'viewing': 1251, 'bystander': 172, 'captured': 182, 'incident': 564, 'place': 844, 'paris': 821, 'resort': 948, 'para': 819, 'released': 927, 'handful': 505, 'featuring': 407, 'whole': 1299, 'host': 544, 'sharing': 1024, 'screen': 994, 'them': 1153, 'there': 1155, 'anything': 65, 'want': 1266, 'parks': 823, 'vacation': 1236, 'wrong': 1327, 'side': 1035, 'security': 1003, 'unfortunately': 1220, 'seems': 1005, 'though': 1164, 'several': 1019, 'people': 831, 
'decided': 288, 'hard': 509, 'strict': 1092, 'meas': 698, 'said': 980, 'hasty': 515, 'changes': 200, 'production': 879, 'timeframe': 1176, 'fell': 411, 'ill': 556, 'riding': 960, 'twilight': 1214, 'zone': 1340, 'tower': 1196, 'terror': 1145, 'imagineers': 558, 'versions': 1243, 'worl': 1318, 'plan': 845, 'three': 1165, 'fall': 395, 'third': 1160, 'watched': 1276, 'based': 101, 'views': 1252, 'didn': 306, 'provide': 889, 'further': 459, 'confirm': 238, 'rema': 928, 'focused': 435, 'called': 176, 'launched': 624, 'playstation': 850, 'markets': 685, 'including': 566, 'canada': 179, 'rebrand': 918, 'previous': 872, 'bravia': 145, 'tvs': 1213, 'phones': 836, 'allow': 45, 'buy': 170, 'rent': 936, 'holder': 535, 'ozark': 814, 'list': 648, 'four': 446, 'seasons': 998, 'beloved': 120, 'actor': 13, 'starred': 1074, 'celebrating': 192, 'cementing': 195, 'oldest': 787, 'living': 653, 'age': 30, 'remain': 929, 'empty': 353, 'multiple': 737, 'experts': 385, 'warn': 1269, 'visiting': 1259, 'dangerous': 274, 'summer': 1111, 'unusually': 1228, 'tranquil': 1200, 'traditionally': 1199, 'landed': 618, 'september': 1011, 'result': 951, 'surpassed': 1119, 'finishes': 430, 'changed': 199, 'our': 801, 'goal': 480, 'preserving': 870, 'integrity': 576, 'hasn': 514, 'course': 260, 'accidents': 5, 'happen': 506, 'arrived': 76, 'promo': 885, 'videos': 1248, 'release': 926, 'seaso': 996, 'valencia': 1237, 'sports': 1068, 'newspaper': 758, 'superdeporte': 1113, 'left': 633, 'speechless': 1063, 'harsh': 510, 'depiction': 297, 'jr': 599, 'apos': 67, 'pinochiusapos': 841, 'edited': 342, 'screenwriter': 995, 'turned': 1209, 'boasts': 138, 'lengthy': 637, 'ip': 585, 'involvement': 584, 'trek': 1202, 'afforded': 26, 'row': 974, 'tarantino': 1134, 'expounding': 390, 'straight': 1086, 'rated': 908, 'former': 443, 'countless': 258, 'commercials': 232, 'died': 307, 'official': 783, 'cause': 190, 'death': 284, 'family': 399, 'tribune': 1204, 'peacefully': 828, 'ap': 66, 'middle': 712, 'team': 1136, 
'journey': 598, 'water': 1278, 'inspired': 575, 'following': 437, 'steps': 1080, 'dwayne': 336, 'johnson': 595, 'pulling': 894, 'planning': 846, 'night': 762, 'couch': 254, 'picks': 837, 'newest': 756, 'multiversal': 738, 'last': 620, 'premiered': 866, 'launching': 626, 'old': 786, 'everyone': 374, 'strong': 1094, 'ratings': 909, 'million': 716, 'viewers': 1250, 'watching': 1277, 'delayed': 293, 'delivering': 295, 'strongest': 1095, 'john': 594, 'shearer': 1026, 'getty': 470, 'images': 557, 'needed': 748, 'large': 619, 'ticket': 1169, 'item': 590, 'rare': 907, 'impact': 559, 'benefit': 121, 'auction': 83, 'she': 1025, 'friend': 451, 'responded': 950, 'grammy': 489, 'winner': 1306, 'donated': 324, 'conc': 236, 'sees': 1007, 'hosting': 545, 'activation': 11, 'oxo': 813, 'south': 1059, 'bank': 99, 'would': 1323, 'rude': 975, 'attend': 80, 'teamed': 1137, 'auth': 86, 'slow': 1043, 'beginnings': 116, 'surprised': 1122, 'naysayers': 746, 'vol': 1260, 'version': 1242, 'related': 925, 'replace': 937, 'toy': 1197, 'breaker': 149, 'surpassing': 1120, 'chart': 206, 'began': 115, 'reporting': 942, 'part': 824, 'renegotiated': 934, 'multiyear': 739, 'co': 221, 'exclusively': 380, 'hope': 542, 'just': 602, 'suite': 1107, 'broadcast': 155, 'cable': 173, 'those': 1163, 'owned': 812, 'broadcasting': 156, 'read': 913, 'academy': 3, 'award': 92, 'join': 596, 'narrators': 745, 'candlelight': 180, 'processional': 876, 'revealed': 955, 'roster': 971, 'celebrity': 194, 'yea': 1329, 'non': 768, 'collaboration': 223, 'peacemaker': 829, 'dcu': 279, 'nerdist': 749, 'fun': 457, 'trip': 1205, 'into': 581, 'nightmare': 763, 'emma': 351, 'mcguinness': 693, 'recently': 920, 'filed': 419, 'resorts': 949, 'severe': 1020, 'claimed': 215, 'su': 1098, 'yet': 1334, 'support': 1116, 'refers': 923, 'backlash': 96, 'various': 1240, 'actions': 10, 'interviews': 580, 'while': 1296, 'promoting': 887, 'upcoming': 1231, 'remake': 931, 'revolves': 957, 'doing': 321, 'right': 961, 'thing': 1158, 'feige': 410, 
'served': 1014, 'president': 871, 'subsidiary': 1101, 'company': 234, 'role': 968, 'guided': 500, 'superhero': 1114, 'powerhouse': 861, 'flag': 432, 'max': 692, 'am': 49, 'apparently': 69, 'else': 348, 'taika': 1128, 'watiti': 1279, 'battlebots': 103, 'chions': 211, 'discovery': 313, 'ninth': 766, 'thanks': 1149, 'domestic': 322, 'licensing': 642, 'agreement': 32, 'warner': 1270, 'bros': 161, 'kick': 606, 'later': 621, 'chances': 198, 'weekend': 1284, 'being': 118, 'overtaken': 808, 'marathon': 683, 'grew': 494, 'continues': 249, 'annual': 61, 'international': 578, 'festivals': 414, 'wraps': 1324, 'food': 439, 'wine': 1305, 'nov': 771, 'holidays': 538, 'runs': 977, 'dec': 287, 'polynesian': 854, 'village': 1253, 'devolved': 302, 'alleged': 43, 'forced': 442, 'room': 970, 'cast': 188, 'members': 701, 'located': 654, 'short': 1031, 'monorail': 726, 'ride': 959, 'ma': 669, 'wonderfully': 1312, 'documentary': 318, 'certain': 197, 'generation': 465, 'holds': 536, 'dear': 283, 'owes': 810, 'debt': 285, 'even': 371, 'never': 754, 'visited': 1258, 'existed': 383, 'sucks': 1103, 'along': 46, 'service': 1015, 'eleven': 347, 'fx': 461, 'freeform': 449, 'starting': 1076, 'reports': 943, 'deadline': 281, 'disne': 314, 'sept': 1010, 'racked': 900, 'billion': 131, 'viewed': 1249, 'across': 8, 'total': 1193, 'turn': 1208, 'dipped': 311, 'staying': 1078, 'expect': 384, 'magical': 674, 'lodging': 655, 'transportation': 1201, 'came': 177, 'unwelcome': 1229, 'souvenir': 1060, 'akihabara': 36, 'tokyo': 1184, 'kotaro': 612, 'takamura': 1129, 'sasebo': 983, 'nagasaki': 744, 'prefecture': 863, 'requested': 944, 'central': 196, 'government': 487, 'quick': 896, 'whether': 1294, 'grant': 490, 'license': 641, 'major': 675, 'walk': 1263, 'scheduled': 991, 'actually': 16, 'alert': 39, 'soft': 1048, 'allears': 42, 'net': 750, 'words': 1315, 'actresses': 15, 'perception': 832, 'arrangement': 75, 'renegotiating': 935, 'weird': 1287, 'morbid': 730, 'autumn': 88, 'variations': 1239, 'singer': 1038, 
'exclusive': 379, 'reneg': 933, 'fine': 429, 'jennifer': 593, 'lee': 632, 'told': 1185, 'crowd': 267, 'blown': 137, 'away': 93, 'seen': 1006, 'wide': 1301, 'sandy': 982, 'beaches': 107, 'gentle': 467, 'surf': 1118, 'endless': 356, 'outdoor': 804, 'activities': 12, 'island': 587, 'carolina': 184, 'beach': 106, 'destinations': 300, 'fewer': 417, 'miles': 715, 'packs': 816, 'surprising': 1123, 'amount': 55, 'meet': 699, 'totally': 1194, 'killer': 608, 'fair': 394}
Encoded Document is:
[[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
...
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]]
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
/Users/chendong/opt/anaconda3/lib/python3.8/site-packages/pandas/core/computation/expressions.py:20: UserWarning: Pandas requires version '2.7.3' or newer of 'numexpr' (version '2.7.1' currently installed). from pandas.core.computation.check import NUMEXPR_INSTALLED
# Load the monthly stock prices and the S&P 500 series.
x = pd.read_csv("../data/modified-data/Monthly_Stock.csv")
y = pd.read_csv("../data/modified-data/sp500_month.csv")
# Keep only the tickers of interest and the index series.
x = x[['NFLX', 'AAPL', 'JPM']]
y = y[['x']]
# Pearson correlation heatmap plus pairwise scatter plots.
df_xy = pd.concat([x, y], axis=1)
sns.heatmap(df_xy.corr('pearson'), annot=True)
sns.pairplot(pd.DataFrame(np.hstack((x, y))))
<seaborn.axisgrid.PairGrid at 0x7fda206f8fd0>
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from sklearn.metrics import accuracy_score
/Users/chendong/opt/anaconda3/lib/python3.8/site-packages/pandas/core/computation/expressions.py:20: UserWarning: Pandas requires version '2.7.3' or newer of 'numexpr' (version '2.7.1' currently installed). from pandas.core.computation.check import NUMEXPR_INSTALLED
# Load the cleaned stock table and keep only the modeling columns.
df = pd.read_csv("../data/modified-data/cleaned_stock.csv")
model_columns = ['Open', 'High', 'Low', 'Close', 'Volume',
                 'Adjusted', 'dn', 'mavg', 'up', 'Stock']
df = df[model_columns]
# Encode the ticker label as integer category codes.
# Disney = 0, Paramount = 1, Warner Bros = 2
df['Stock'] = df['Stock'].astype('category').cat.codes
print(df)
Open High Low Close Volume Adjusted \
0 155.830002 157.559998 155.360001 156.759995 10222800 156.759995
1 158.589996 160.320007 155.550003 155.729996 16582000 155.729996
2 156.520004 159.380005 155.100006 155.190002 12272100 155.190002
3 156.240005 157.770004 153.679993 156.899994 11095300 156.899994
4 156.899994 159.300003 156.289993 157.830002 9554600 157.830002
... ... ... ... ... ... ...
1375 10.010000 10.180000 9.720000 9.730000 32193700 9.730000
1376 9.700000 9.970000 9.570000 9.800000 18682700 9.800000
1377 9.800000 9.910000 9.530000 9.550000 15150700 9.550000
1378 9.650000 9.970000 9.630000 9.850000 18128400 9.850000
1379 9.890000 10.110000 9.800000 9.940000 23004200 9.940000
dn mavg up Stock
0 146.548151 151.926833 157.305514 0
1 146.628339 152.323833 158.019326 0
2 146.645134 152.596833 158.548532 0
3 146.630934 152.776666 158.922399 0
4 146.492148 153.018166 159.544185 0
... ... ... ... ...
1375 9.913337 10.476000 11.038663 2
1376 9.808119 10.424167 11.040215 2
1377 9.703007 10.362333 11.021660 2
1378 9.632098 10.324167 11.016236 2
1379 9.591074 10.298500 11.005926 2
[1380 rows x 10 columns]
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
# df is assumed to be loaded above.
# Features: adjusted close and lower Bollinger band; target: stock label.
X = df[["Adjusted", "dn"]]
y = df["Stock"]  # integer-coded ticker label
# Hold out 20% of the rows for testing; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
for label, part in (("x_train.shape", X_train), ("y_train.shape", y_train),
                    ("X_test.shape", X_test), ("y_test.shape", y_test)):
    print(label, ":", part.shape)
x_train.shape : (1104, 2) y_train.shape : (1104,) X_test.shape : (276, 2) y_test.shape : (276,)
# Fit a multinomial naive Bayes classifier on the training split.
model = MultinomialNB()
model.fit(X_train, y_train)
# Predict labels for the held-out rows.
y_pred = model.predict(X_test)
# Overall accuracy on the test split.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
# Per-class precision/recall/F1 breakdown.
report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)
Accuracy: 0.5362318840579711
Classification Report:
precision recall f1-score support
0 0.57 0.75 0.65 93
1 0.00 0.00 0.00 95
2 0.51 0.89 0.65 88
accuracy 0.54 276
macro avg 0.36 0.55 0.43 276
weighted avg 0.35 0.54 0.42 276
/Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1221: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
def report(y,ypred):
    """Print the percent accuracy and the count of mislabeled points.

    y: true labels (array-like with .shape); ypred: predicted labels.
    """
    # Fraction of correct predictions, scaled to a percentage.
    pct = accuracy_score(y, ypred) * 100
    print("Accuracy:", pct)
    # Element-wise disagreement count between truth and prediction.
    n_total = y.shape[0]
    n_bad = (y != ypred).sum()
    print("Number of mislabeled points out of a total %d points = %d"
          % (n_total, n_bad))
def print_model_summary():
    """Print accuracy reports and the first 20 predictions for the
    module-level `model` on the train/test partitions."""
    # (section title, true labels, predicted labels) for each partition.
    parts = [("TRAINING SET:", y_train, model.predict(X_train)),
             ("\nTEST SET (UNTRAINED DATA):", y_test, model.predict(X_test))]
    print("ACCURACY CALCULATION\n")
    for title, truth, pred in parts:
        print(title)
        report(truth, pred)
    print("\nCHECK FIRST 20 PREDICTIONS")
    # Side-by-side preview: truth, prediction, and their difference.
    for title, truth, pred in parts:
        print(title)
        print(truth[0:20])
        print(pred[0:20])
        print("ERRORS:", pred[0:20] - truth[0:20])
print_model_summary()
ACCURACY CALCULATION TRAINING SET: Accuracy: 48.731884057971016 Number of mislabeled points out of a total 1104 points = 566 TEST SET (UNTRAINED DATA): Accuracy: 53.62318840579711 Number of mislabeled points out of a total 276 points = 128 CHECK FIRST 20 PREDICTIONS TRAINING SET: 695 1 1088 2 1106 2 558 1 494 1 462 1 1074 2 243 0 936 2 654 1 756 1 1309 2 420 0 982 2 54 0 1089 2 783 1 109 0 774 1 571 1 Name: Stock, dtype: int8 [0 2 2 2 0 2 2 0 2 0 0 2 0 0 2 0 2 0 2 0] ERRORS: 695 -1 1088 0 1106 0 558 1 494 -1 462 1 1074 0 243 0 936 0 654 -1 756 -1 1309 0 420 0 982 -2 54 2 1089 -2 783 1 109 0 774 1 571 -1 Name: Stock, dtype: int8 TEST SET (UNTRAINED DATA): 377 0 548 1 979 2 1149 2 481 1 76 0 67 0 1096 2 584 1 824 1 1052 2 828 1 184 0 989 2 575 1 429 0 361 0 829 1 1239 2 196 0 Name: Stock, dtype: int8 [0 0 2 2 0 0 0 2 2 2 2 2 0 2 0 0 0 2 0 0] ERRORS: 377 0 548 -1 979 0 1149 0 481 -1 76 0 67 0 1096 0 584 1 824 1 1052 0 828 1 184 0 989 0 575 -1 429 0 361 0 829 1 1239 -2 196 0 Name: Stock, dtype: int8
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
# Generate a confusion matrix
# Rows = true class, columns = predicted class.
cm = confusion_matrix(y_test, y_pred)
# Create a heatmap of the confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=['Class 0', 'Class 1', 'Class 2'], yticklabels=['Class 0', 'Class 1', 'Class 2'])
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize
# Convert labels to a binary form (one-hot encoding)
# One column per class for one-vs-rest ROC computation.
y_test_bin = label_binarize(y_test, classes=[0, 1, 2])
# Initialize the model and train it
model = MultinomialNB()
model.fit(X_train, y_train)
# Get class probabilities
y_prob = model.predict_proba(X_test)
# Compute ROC curve and AUC for each class
n_classes = y_test_bin.shape[1]
fpr = dict()
tpr = dict()
roc_auc = dict()
# One-vs-rest: each class's binarized column against its probability column.
for i in range(n_classes):
fpr[i], tpr[i], _ = roc_curve(y_test_bin[:, i], y_prob[:, i])
roc_auc[i] = auc(fpr[i], tpr[i])
# Plot ROC curves for each class
plt.figure(figsize=(8, 6))
colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k']
for i, color in zip(range(n_classes), colors):
plt.plot(fpr[i], tpr[i], color=color, lw=2, label=f'ROC curve (area = {roc_auc[i]:.2f})')
# Diagonal = no-skill reference line.
plt.plot([0, 1], [0, 1], 'k--', lw=2)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve for Multiclass')
plt.legend(loc="lower right")
plt.show()
# Define the number of iterations and an array to store accuracy values
# Stability check: re-split and re-train 100 times with different seeds.
n = 100
accuracy_values = []
# Load your data and labels here, replace with your actual data
for iteration in range(1, n + 1):
# Split the data into training and validation subsets for each iteration
# random_state=iteration gives a distinct, reproducible split each time.
x_train, x_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=iteration)
# Initialize the MultinomialNB model
model = MultinomialNB()
# Train the MultinomialNB model on the current subset of training data
model.fit(x_train, y_train)
# Make predictions on the validation data
y_pred = model.predict(x_valid)
# Calculate and store the accuracy for this time
accuracy = accuracy_score(y_valid, y_pred)
accuracy_values.append(accuracy)
# Plot the accuracy values
plt.figure(figsize=(10, 6))
plt.plot(range(1, n + 1), accuracy_values, marker='o')
plt.title('Accuracy')
plt.xlabel('Times')
plt.ylabel('Accuracy')
plt.grid()
plt.show()
# Distribution of test accuracy over many random splits.
n = 1000
# Store accuracy values in a list
accuracy_values = []
for _ in range(n):
# Split the data into training and testing sets
# NOTE(review): randint(100) draws only 100 distinct seeds, so at most
# 100 distinct splits are sampled across the 1000 runs — confirm intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=np.random.randint(100))
# Train the model on the training data
model.fit(X_train, y_train)
# Make predictions on the testing data
y_pred = model.predict(X_test)
# Calculate and store the accuracy
accuracy = accuracy_score(y_test, y_pred)
accuracy_values.append(accuracy)
# Plot the distribution of accuracy values
plt.figure(figsize=(10, 6))
plt.hist(accuracy_values, bins=20, edgecolor='black')
plt.title('Distribution of Accuracy')
plt.xlabel('Accuracy')
plt.ylabel('Frequency')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from sklearn.metrics import accuracy_score
/Users/chendong/opt/anaconda3/lib/python3.8/site-packages/pandas/core/computation/expressions.py:20: UserWarning: Pandas requires version '2.7.3' or newer of 'numexpr' (version '2.7.1' currently installed). from pandas.core.computation.check import NUMEXPR_INSTALLED
#RELOAD FILE AND PRETEND THAT IS OUR STARTING POINT
df=pd.read_csv('../data/raw-data/text.csv')
print(df.shape)
#CONVERT FROM STRING LABELS TO INTEGERS
# labels: first-seen order of unique label strings; y1: integer class ids.
labels=[]; #y1=[]; y2=[]
y1=[]
for label in df["Label"]:
if label not in labels:
labels.append(label)
print("index =",len(labels)-1,": label =",label)
# Map this row's label to its index (per the output below, this runs once
# per row, so len(y1) == len(df)).
for i in range(0,len(labels)):
if(label==labels[i]):
y1.append(i)
y1=np.array(y1)
# CONVERT DF TO LIST OF STRINGS
corpus=df["Title"].to_list()
# y2 keeps the original string labels alongside the integer ids in y1.
y2=df["Label"].to_numpy()
print("number of text chunks = ",len(corpus))
print(corpus[0:3])
(4998, 2) index = 0 : label = Netflix index = 1 : label = Hulu index = 2 : label = Prime Video index = 3 : label = Disney+ number of text chunks = 4998 ['Breaking Bad', 'Stranger Things', 'Better Call Saul']
# INITIALIZE COUNT VECTORIZER
# min_df as a float is a document-frequency fraction:
# min_df = 0.001 means "ignore terms that appear in less than 0.1% of documents";
# an integer (e.g. min_df = 5) would mean "less than 5 documents".
vectorizer=CountVectorizer(min_df=0.001)
# RUN COUNT VECTORIZER ON OUR COURPUS
Xs = vectorizer.fit_transform(corpus)
X=np.array(Xs.todense())
#CONVERT TO ONE-HOT VECTORS
# Dividing by the per-column max and taking ceil maps any nonzero count to 1.
maxs=np.max(X,axis=0)
X=np.ceil(X/maxs)
# DOUBLE CHECK
print(X.shape,y1.shape,y2.shape)
print("DATA POINT-0:",X[0,0:10],"y1 =",y1[0]," y2 =",y2[0])
(4998, 477) (4998,) (4998,) DATA POINT-0: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.] y1 = 0 y2 = Netflix
Assignment 3.2.4: Break data into an 80-20 training/test set
As a sanity check, reprint the shapes to make sure everything is correct
x_train.shape : (120, 4)
y_train.shape : (120,)
X_test.shape : (30, 4)
y_test.shape : (30,)
# BEFORE SPLIT
# Peek at a slice of the labels: rows are still grouped by class here.
print(y1[1000:1200])
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
# INSERT CODE TO PARTITION DATASET INTO TRAINING-TEST
from sklearn.model_selection import train_test_split
test_ratio=0.2
# SPLIT ARRAYS OR MATRICES INTO RANDOM TRAIN AND TEST SUBSETS.
x_train, x_test, y_train, y_test = train_test_split(X, y1, test_size=test_ratio, random_state=0)
# Ensure the label arrays are 1-D for the classifier.
y_train=y_train.flatten()
y_test=y_test.flatten()
print("x_train.shape :",x_train.shape)
print("y_train.shape :",y_train.shape)
print("X_test.shape :",x_test.shape)
print("y_test.shape :",y_test.shape)
x_train.shape : (3998, 477) y_train.shape : (3998,) X_test.shape : (1000, 477) y_test.shape : (1000,)
#CHECK TO MAKE SURE IT WAS RANDOMIZED
# After shuffling, the first 100 labels should mix all classes.
print(y_train[0:100])
[3 0 2 1 0 2 3 2 2 2 2 2 2 1 0 0 0 1 2 2 2 0 1 1 2 1 1 2 2 3 2 2 1 0 3 3 1 2 2 0 0 1 1 1 0 1 2 2 0 1 0 2 2 2 2 3 0 1 3 2 2 0 0 0 2 1 0 2 1 0 1 0 0 1 2 1 0 2 1 2 0 1 0 0 1 2 2 2 2 1 0 2 1 2 0 1 0 3 2 2]
def report(y,ypred):
    """Summarize prediction quality: accuracy (%) and mislabel count."""
    print("Accuracy:", 100 * accuracy_score(y, ypred))
    # wrong = positions where prediction disagrees with the true label
    wrong = (y != ypred).sum()
    print("Number of mislabeled points out of a total %d points = %d"
          % (y.shape[0], wrong))
def print_model_summary():
    """Report train/test accuracy of the global `model` and preview predictions."""
    yp_train = model.predict(x_train)
    yp_test = model.predict(x_test)
    print("ACCURACY CALCULATION\n")
    # Accuracy section for each data partition.
    for name, truth, pred in (("TRAINING SET:", y_train, yp_train),
                              ("\nTEST SET (UNTRAINED DATA):", y_test, yp_test)):
        print(name)
        report(truth, pred)
    print("\nCHECK FIRST 20 PREDICTIONS")
    # Side-by-side preview of the first 20 labels vs predictions.
    for name, truth, pred in (("TRAINING SET:", y_train, yp_train),
                              ("\nTEST SET (UNTRAINED DATA):", y_test, yp_test)):
        print(name)
        print(truth[0:20])
        print(pred[0:20])
        print("ERRORS:", pred[0:20] - truth[0:20])
The following code applies the multinomial Naive Bayes classifier to the text dataset generated from Wikipedia.
from sklearn.naive_bayes import MultinomialNB
# INITIALIZE MODEL
model = MultinomialNB()
# TRAIN MODEL
# Fit on the one-hot word-presence features from the vectorizer cell above.
model.fit(x_train,y_train)
# PRINT REPORT USING UTILITY FUNCTION ABOVE
print_model_summary()
ACCURACY CALCULATION TRAINING SET: Accuracy: 48.049024512256125 Number of mislabeled points out of a total 3998 points = 2077 TEST SET (UNTRAINED DATA): Accuracy: 41.099999999999994 Number of mislabeled points out of a total 1000 points = 589 CHECK FIRST 20 PREDICTIONS TRAINING SET: [3 0 2 1 0 2 3 2 2 2 2 2 2 1 0 0 0 1 2 2] [3 0 1 0 2 0 0 2 0 2 2 2 0 0 0 0 0 1 2 0] ERRORS: [ 0 0 -1 -1 2 -2 -3 0 -2 0 0 0 -2 -1 0 0 0 0 0 -2] TEST SET (UNTRAINED DATA): [2 1 0 0 1 0 0 2 2 0 3 0 2 2 0 1 3 1 0 0] [2 1 0 0 1 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0] ERRORS: [ 0 0 0 0 0 0 0 -2 -2 0 -3 0 0 -2 0 -1 -3 -1 0 0]
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
# Generate a classification report
# NOTE(review): `y_pred` is not assigned anywhere earlier in this document's
# visible flow — confirm which cell produced it (the printed accuracy 0.49
# below differs from the 0.41 test accuracy above, suggesting stale state).
report = classification_report(y_test, y_pred, target_names=['Class 0', 'Class 1', 'Class 2', 'Class 3'])
print("Classification Report:\n", report)
# Generate a confusion matrix
cm = confusion_matrix(y_test, y_pred)
# Create a heatmap of the confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=['Class 0', 'Class 1', 'Class 2', 'Class 3'], yticklabels=['Class 0', 'Class 1', 'Class 2', 'Class 3'])
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()
Classification Report:
precision recall f1-score support
Class 0 0.45 0.79 0.57 366
Class 1 0.50 0.26 0.34 259
Class 2 0.60 0.38 0.46 322
Class 3 0.88 0.28 0.43 53
accuracy 0.49 1000
macro avg 0.61 0.43 0.45 1000
weighted avg 0.53 0.49 0.47 1000
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize
# Convert labels to a binary form (one-hot encoding)
# Four streaming-service classes for one-vs-rest ROC.
y_test_bin = label_binarize(y_test, classes=[0, 1, 2, 3])
# Initialize the model and train it
model = MultinomialNB()
model.fit(x_train, y_train)
# Get class probabilities
y_prob = model.predict_proba(x_test)
# Compute ROC curve and AUC for each class
n_classes = y_test_bin.shape[1]
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
fpr[i], tpr[i], _ = roc_curve(y_test_bin[:, i], y_prob[:, i])
roc_auc[i] = auc(fpr[i], tpr[i])
# Plot ROC curves for each class
plt.figure(figsize=(8, 6))
colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k']
for i, color in zip(range(n_classes), colors):
plt.plot(fpr[i], tpr[i], color=color, lw=2, label=f'ROC curve (area = {roc_auc[i]:.2f})')
# Diagonal = no-skill reference line.
plt.plot([0, 1], [0, 1], 'k--', lw=2)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve for Multiclass')
plt.legend(loc="lower right")
plt.show()
# Define the number of times and an array to store accuracy values
# Stability check on the text classifier: 100 reseeded splits.
n = 100
accuracy_values = []
# Load your data and labels here, replace with your actual data
for iteration in range(1, n + 1):
# Split the data into training and validation subsets for each iteration
x_train, x_valid, y_train, y_valid = train_test_split(X, y1, test_size=0.2, random_state=iteration)
# Initialize the MultinomialNB model
model = MultinomialNB()
# Train the MultinomialNB model on the current subset of training data
model.fit(x_train, y_train)
# Make predictions on the validation data
y_pred = model.predict(x_valid)
# Calculate and store the accuracy for this times
accuracy = accuracy_score(y_valid, y_pred)
accuracy_values.append(accuracy)
# Plot the accuracy values
plt.figure(figsize=(10, 6))
plt.plot(range(1, n + 1), accuracy_values, marker='o')
plt.title('Accuracy')
plt.xlabel('Times')
plt.ylabel('Accuracy')
plt.grid()
plt.show()
# Distribution of test accuracy over many random splits (text data).
n = 1000
# Store accuracy values in a list
accuracy_values = []
for _ in range(n):
# Split the data into training and testing sets
# NOTE(review): randint(100) limits the experiment to <=100 distinct splits.
X_train, X_test, y_train, y_test = train_test_split(X, y1, test_size=0.2, random_state=np.random.randint(100))
# Train the model on the training data
model.fit(X_train, y_train)
# Make predictions on the testing data
y_pred = model.predict(X_test)
# Calculate and store the accuracy
accuracy = accuracy_score(y_test, y_pred)
accuracy_values.append(accuracy)
# Plot the distribution of accuracy values
plt.figure(figsize=(10, 6))
plt.hist(accuracy_values, bins=20, edgecolor='black')
plt.title('Distribution of Accuracy')
plt.xlabel('Accuracy')
plt.ylabel('Frequency')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()
format: html: embed-resources: true code-fold: true
# import the necessary packages
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
# read data
# stock_cluster.csv: daily price rows (per the head() output: OHLCV, Adjusted,
# Bollinger columns dn/mavg/up, a Stock ticker, and a direction flag).
df = pd.read_csv("../data/modified-data/stock_cluster.csv")
df.head()
| Unnamed: 0 | Date | Open | High | Low | Close | Volume | Adjusted | dn | mavg | up | Stock | direction | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2022-01-03 | 2022-01-03 | 605.609985 | 609.989990 | 590.559998 | 597.369995 | 3067500 | 597.369995 | 587.291452 | 607.250833 | 627.210215 | NFLX | Decreasing |
| 1 | 2022-01-04 | 2022-01-04 | 599.909973 | 600.409973 | 581.599976 | 591.150024 | 4393100 | 591.150024 | 585.186408 | 606.287166 | 627.387925 | NFLX | Decreasing |
| 2 | 2022-01-05 | 2022-01-05 | 592.000000 | 592.840027 | 566.880005 | 567.520020 | 4148700 | 567.520020 | 580.284351 | 603.976666 | 627.668982 | NFLX | Decreasing |
| 3 | 2022-01-06 | 2022-01-06 | 554.340027 | 563.359985 | 542.010010 | 553.289978 | 5711800 | 553.289978 | 570.018038 | 600.225332 | 630.432625 | NFLX | Decreasing |
| 4 | 2022-01-07 | 2022-01-07 | 549.460022 | 553.429993 | 538.219971 | 541.059998 | 3382900 | 541.059998 | 558.782412 | 596.575831 | 634.369250 | NFLX | Decreasing |
# define X and Y
from sklearn.preprocessing import StandardScaler
# Two numeric features for clustering: adjusted close price and volume.
X = df[['Adjusted', 'Volume']]
print(X.head())
# Standardize to zero mean / unit variance so both features weigh equally.
X = StandardScaler().fit_transform(X)
# Y holds the ticker symbol — kept for reference, not used by the clusterers.
Y = df[['Stock']]
print(Y.head())
Adjusted Volume 0 597.369995 3067500 1 591.150024 4393100 2 567.520020 4148700 3 553.289978 5711800 4 541.059998 3382900 Stock 0 NFLX 1 NFLX 2 NFLX 3 NFLX 4 NFLX
# visualization
# Raw (unscaled) feature scatter, colored by ticker, as a visual baseline
# before running the clustering algorithms.
plt.figure(figsize=(10, 6))
sns.scatterplot(data=df, x='Adjusted', y='Volume', hue='Stock')
plt.xlabel('Adjusted')
plt.ylabel('Volume')
plt.title('Cluster Plot')
plt.legend(loc = "upper right", title = "Stock")
plt.show()
# import relevent libraries for clustering. we will use KMeans, AgglomerativeClustering, MeanShift, Birch, and DBSCAN
import sklearn.cluster
import sklearn.metrics
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
# THIS WILL ITERATE OVER ONE HYPER-PARAMETER (GRID SEARCH)
# AND RETURN THE CLUSTER RESULT THAT OPTIMIZES THE SILHOUETTE SCORE
def maximize_silhouette(X,algo="birch",nmax=20,i_plot=False):
    """Grid-search one hyper-parameter for a clustering algorithm and return
    the labels of the clustering that maximizes the silhouette score.

    Parameters:
        X      : array-like feature matrix (standardized upstream).
        algo   : one of "birch", "ag", "dbscan", "kmeans", "meanshift".
        nmax   : upper bound of the hyper-parameter grid (n_clusters /
                 bandwidth; for dbscan the grid index maps to eps = 0.05*(k-1)).
        i_plot : if True, plot silhouette score vs hyper-parameter.

    Returns:
        Cluster labels of the best run, or None if no run produced a
        computable silhouette score.
    """
    i_print = False
    # Some sklearn estimators require C-contiguous input.
    X = np.ascontiguousarray(X)
    # LOOP OVER HYPER-PARAM
    params = []; sil_scores = []
    sil_max = -10
    # Fix: initialize so the function cannot raise UnboundLocalError when
    # every candidate fails the silhouette computation.
    opt_param = None
    opt_labels = None
    for param in range(2, nmax + 1):
        if algo == "birch":
            model = sklearn.cluster.Birch(n_clusters=param).fit(X)
            labels = model.predict(X)
        if algo == "ag":
            model = sklearn.cluster.AgglomerativeClustering(n_clusters=param).fit(X)
            labels = model.labels_
        if algo == "dbscan":
            # Reinterpret the integer grid index as an eps value.
            param = 0.05 * (param - 1)
            model = sklearn.cluster.DBSCAN(eps=param).fit(X)
            labels = model.labels_
        if algo == "kmeans":
            model = sklearn.cluster.KMeans(n_clusters=param).fit(X)
            labels = model.predict(X)
        if algo == "meanshift":
            model = sklearn.cluster.MeanShift(bandwidth=param).fit(X)
            labels = model.labels_
        try:
            # Fix: original relied on sklearn.metrics being loaded as a side
            # effect of importing sklearn.cluster; import it explicitly here.
            import sklearn.metrics
            sil_scores.append(sklearn.metrics.silhouette_score(X, labels))
            params.append(param)
        except ValueError:
            # Silhouette is undefined (e.g. a single cluster) — skip, but do
            # not swallow unrelated errors with a bare except as before.
            continue
        if i_print: print(param, sil_scores[-1])
        if sil_scores[-1] > sil_max:
            opt_param = param
            sil_max = sil_scores[-1]
            opt_labels = labels
    print("Algorithm = ", algo)
    print("OPTIMAL PARAMETER =", opt_param)
    if i_plot:
        fig, ax = plt.subplots()
        ax.plot(params, sil_scores, "-o")
        ax.set(xlabel='Hyper-parameter', ylabel='Silhouette')
        plt.show()
    return opt_labels
# Run the silhouette grid search for each algorithm; each call prints its
# optimal hyper-parameter and plots score vs parameter.
# AGGLOMERATIVE CLUSTERING
opt_labels=maximize_silhouette(X,algo="ag",nmax=15, i_plot=True)
# DBSCAN
opt_labels=maximize_silhouette(X,algo="dbscan",nmax=15, i_plot=True)
# BRICH
opt_labels=maximize_silhouette(X,algo="birch",nmax=15, i_plot=True)
# KMEANS
# NOTE: only the last call's labels survive in opt_labels.
opt_labels=maximize_silhouette(X,algo="kmeans",nmax=15, i_plot=True)
Algorithm = ag OPTIMAL PARAMETER = 3
Algorithm = dbscan OPTIMAL PARAMETER = 0.35000000000000003
Algorithm = birch OPTIMAL PARAMETER = 4
/Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/cluster/_birch.py:725: ConvergenceWarning: Number of subclusters found (11) by BIRCH is less than (12). Decrease the threshold. warnings.warn( /Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/cluster/_birch.py:725: ConvergenceWarning: Number of subclusters found (11) by BIRCH is less than (13). Decrease the threshold. warnings.warn( /Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/cluster/_birch.py:725: ConvergenceWarning: Number of subclusters found (11) by BIRCH is less than (14). Decrease the threshold. warnings.warn( /Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/cluster/_birch.py:725: ConvergenceWarning: Number of subclusters found (11) by BIRCH is less than (15). Decrease the threshold. warnings.warn(
/Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/cluster/_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning super()._check_params_vs_input(X, default_n_init=10) /Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/cluster/_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning super()._check_params_vs_input(X, default_n_init=10) /Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/cluster/_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning super()._check_params_vs_input(X, default_n_init=10) /Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/cluster/_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning super()._check_params_vs_input(X, default_n_init=10) /Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/cluster/_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning super()._check_params_vs_input(X, default_n_init=10) /Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/cluster/_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning super()._check_params_vs_input(X, default_n_init=10) /Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/cluster/_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. 
Set the value of `n_init` explicitly to suppress the warning super()._check_params_vs_input(X, default_n_init=10) /Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/cluster/_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning super()._check_params_vs_input(X, default_n_init=10) /Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/cluster/_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning super()._check_params_vs_input(X, default_n_init=10) /Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/cluster/_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning super()._check_params_vs_input(X, default_n_init=10) /Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/cluster/_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning super()._check_params_vs_input(X, default_n_init=10) /Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/cluster/_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning super()._check_params_vs_input(X, default_n_init=10) /Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/cluster/_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. 
Set the value of `n_init` explicitly to suppress the warning super()._check_params_vs_input(X, default_n_init=10) /Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/cluster/_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning super()._check_params_vs_input(X, default_n_init=10)
Algorithm = kmeans OPTIMAL PARAMETER = 3
## kmeans
from scipy.spatial.distance import cdist
# Elbow-method inputs: distortion (mean nearest-centroid distance) and
# inertia (within-cluster sum of squares) for k = 1..10.
distortions = []
inertias = []
for i in range(1, 11):
# Initialize and fit the KMeans model
kmeans = KMeans(n_clusters = i)
kmeans.fit(X)
# Calculate distortion and inertia
inertias.append(kmeans.inertia_)
distortions.append(sum(np.min(cdist(X, kmeans.cluster_centers_, 'euclidean'),axis=1)) / X.shape[0])
# Create a DataFrame to store the data
data = pd.DataFrame({'Cluster': range(1, 11), 'Distortion': distortions, 'Inertia': inertias})
data
/Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/cluster/_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning super()._check_params_vs_input(X, default_n_init=10) /Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/cluster/_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning super()._check_params_vs_input(X, default_n_init=10) /Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/cluster/_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning super()._check_params_vs_input(X, default_n_init=10) /Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/cluster/_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning super()._check_params_vs_input(X, default_n_init=10) /Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/cluster/_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning super()._check_params_vs_input(X, default_n_init=10) /Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/cluster/_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning super()._check_params_vs_input(X, default_n_init=10) /Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/cluster/_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. 
Set the value of `n_init` explicitly to suppress the warning super()._check_params_vs_input(X, default_n_init=10) /Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/cluster/_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning super()._check_params_vs_input(X, default_n_init=10) /Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/cluster/_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning super()._check_params_vs_input(X, default_n_init=10) /Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/cluster/_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning super()._check_params_vs_input(X, default_n_init=10)
| Cluster | Distortion | Inertia | |
|---|---|---|---|
| 0 | 1 | 1.268869 | 2862.000000 |
| 1 | 2 | 0.855398 | 1407.136470 |
| 2 | 3 | 0.463472 | 493.295513 |
| 3 | 4 | 0.391171 | 332.985543 |
| 4 | 5 | 0.330239 | 237.692843 |
| 5 | 6 | 0.292370 | 183.698590 |
| 6 | 7 | 0.255055 | 138.253996 |
| 7 | 8 | 0.238272 | 119.543276 |
| 8 | 9 | 0.227034 | 101.652802 |
| 9 | 10 | 0.212766 | 89.814633 |
# plot distortion and inertia for kmeans, you can either plot them seperately or use fig, ax = plt.subplots(1, 2) to plot them in the same figure. Suggest the optimal number of clusters based on the plot.
# Create two subplots, sharing the x-axis
fig, (ax1, ax2) = plt.subplots(2, 1, sharex=True)
# Plot Distortion in the top subplot
ax1.plot(data['Cluster'], data['Distortion'], linestyle='-', label='Distortion', color='green')
# Plot Inertia in the bottom subplot
ax2.plot(data['Cluster'], data['Inertia'], linestyle='-', label='Inertia', color='orange')
ax2.set_xlabel('Cluster')
# Add a legend to both subplots
ax1.legend()
ax2.legend()
plt.show()
# Perform kmeans
# k=3 chosen from the elbow plot / silhouette search above.
kmeans = KMeans(n_clusters = 3)
cluster_labels = kmeans.fit_predict(X)
# Add the cluster labels to your DataFrame
df['cluster_labels'] = cluster_labels
# Visualize the clusters
# Note: clustering ran on standardized X, but the plot uses raw df columns.
plt.figure(figsize=(10, 6))
# Plot the data points with different colors for each cluster
for cluster_label in df['cluster_labels'].unique():
cluster_data = df[df['cluster_labels'] == cluster_label]
plt.scatter(
cluster_data['Adjusted'],
cluster_data['Volume'],
label=f'Cluster {cluster_label}',
)
plt.xlabel('Adjusted')
plt.ylabel('Volume')
plt.title('Kmeans Clustering')
plt.legend()
plt.show()
/Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/cluster/_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning super()._check_params_vs_input(X, default_n_init=10)
from sklearn.metrics import silhouette_samples, silhouette_score
# Calculate silhouette scores
# Per-sample values for the plot, plus the overall average for the red line.
silhouette_vals = silhouette_samples(X, cluster_labels)
silhouette_avg = silhouette_score(X, cluster_labels)
# Create Silhouette plot
plt.figure(figsize=(10, 6))
y_lower = 10
# One horizontal band per cluster (k=3), sorted by silhouette value.
for i in range(3):
ith_cluster_silhouette_vals = silhouette_vals[cluster_labels == i]
ith_cluster_silhouette_vals.sort()
size_cluster_i = ith_cluster_silhouette_vals.shape[0]
y_upper = y_lower + size_cluster_i
color = plt.rcParams['axes.prop_cycle'].by_key()['color'][i]
plt.fill_betweenx(np.arange(y_lower, y_upper), 0, ith_cluster_silhouette_vals, facecolor=color, edgecolor=color, alpha=0.7)
# Label the silhouette plots with their cluster numbers at the middle
plt.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
# Compute the new y_lower for the next plot
y_lower = y_upper + 10 # 10 for the 0 samples
plt.title('Silhouette Plot for K-Means Clustering')
plt.xlabel('Silhouette coefficient values')
plt.ylabel('Cluster label')
# The vertical line for average silhouette score of all the values
plt.axvline(x=silhouette_avg, color="red", linestyle="--", linewidth=2)
plt.yticks([]) # Clear the yaxis labels / ticks
plt.xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])
plt.show()
# perform DBSCAN clustering. use the eps and min_samples parameters to find the optimal number of clusters. plot the number of clusters vs the silhouette score. Suggest the optimal number of clusters based on the plot.
from sklearn.metrics import silhouette_score
# Initialize an empty list to store silhouette scores
silhouette_scores = []
# Define a range of values for eps and min_samples
eps_range = np.linspace(0.05, 1.0, num=20)
min_samples_range = range(1, 11)
# Perform DBSCAN clustering for different combinations of eps and min_samples
for eps in eps_range:
    for min_samples in min_samples_range:
        dbscan = DBSCAN(eps=eps, min_samples=min_samples)
        labels = dbscan.fit_predict(X)
        # Fix: only subtract the noise pseudo-cluster when it is actually
        # present (label -1). The original subtracted 1 unconditionally,
        # under-counting clusters for parameter combos with no noise points.
        num_clusters = len(np.unique(labels)) - (1 if -1 in labels else 0)
        if num_clusters > 1:  # silhouette requires at least two clusters
            silhouette = silhouette_score(X, labels)
            silhouette_scores.append((eps, min_samples, num_clusters, silhouette))
# Convert the list of scores to a NumPy array for easier manipulation
silhouette_scores = np.array(silhouette_scores)
# Fix: fail with a clear message instead of argmax crashing on an empty array.
if silhouette_scores.size == 0:
    raise ValueError("No (eps, min_samples) combination produced more than one cluster")
# Plot the number of clusters vs. silhouette score, colored by eps
plt.figure(figsize=(10, 6))
plt.scatter(silhouette_scores[:, 2], silhouette_scores[:, 3], c=silhouette_scores[:, 0], cmap='viridis', s=50)
plt.colorbar(label='Epsilon (eps)')
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Score')
plt.title('Number of Clusters vs. Silhouette Score for DBSCAN')
#plt.grid(True)
# Find the combination of eps and min_samples with the highest silhouette score
best_score_idx = np.argmax(silhouette_scores[:, 3])
best_eps = silhouette_scores[best_score_idx, 0]
best_min_samples = silhouette_scores[best_score_idx, 1]
best_num_clusters = silhouette_scores[best_score_idx, 2]
best_silhouette_score = silhouette_scores[best_score_idx, 3]
print(f"Best combination: eps = {best_eps}, min_samples = {best_min_samples}")
print(f"Number of clusters: {best_num_clusters}")
print(f"Best Silhouette Score: {best_silhouette_score}")
plt.show()
Best combination: eps = 0.1, min_samples = 9.0 Number of clusters: 4.0 Best Silhouette Score: 0.4658880301489884
# Perform DBSCAN
# Parameters taken from the grid search above (eps=0.1, min_samples=9).
dbscan = DBSCAN(eps=0.1, min_samples=9)
cluster_labels = dbscan.fit_predict(X)
# Add the cluster labels to your DataFrame
# Label -1 marks DBSCAN noise points.
df['cluster_labels'] = cluster_labels
# Visualize the clusters
plt.figure(figsize=(10, 6))
# Plot the data points with different colors for each cluster
for cluster_label in df['cluster_labels'].unique():
cluster_data = df[df['cluster_labels'] == cluster_label]
plt.scatter(
cluster_data['Adjusted'],
cluster_data['Volume'],
label=f'Cluster {cluster_label}',
)
plt.xlabel('Adjusted')
plt.ylabel('Volume')
plt.title('DBSCAN Clustering')
plt.legend()
plt.show()
# Calculate silhouette scores for the DBSCAN labels (noise is labeled -1)
silhouette_vals = silhouette_samples(X, cluster_labels)
silhouette_avg = silhouette_score(X, cluster_labels)
# Create Silhouette plot
plt.figure(figsize=(10, 6))
y_lower = 10
for i in range(-1, 4):  # -1 = DBSCAN noise band, then clusters 0..3
    ith_cluster_silhouette_vals = silhouette_vals[cluster_labels == i]
    ith_cluster_silhouette_vals.sort()
    size_cluster_i = ith_cluster_silhouette_vals.shape[0]
    y_upper = y_lower + size_cluster_i
    # Shift the color index by 1 so the noise band (-1) gets color 0.
    color = plt.rcParams['axes.prop_cycle'].by_key()['color'][i+1]
    plt.fill_betweenx(np.arange(y_lower, y_upper), 0, ith_cluster_silhouette_vals, facecolor=color, edgecolor=color, alpha=0.7)
    # Label the silhouette plots with their cluster numbers at the middle
    plt.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
    # Compute the new y_lower for the next plot
    y_lower = y_upper + 10  # 10 for the 0 samples
# Fix: this figure shows DBSCAN results — the title wrongly said "K-Means".
plt.title('Silhouette Plot for DBSCAN Clustering')
plt.xlabel('Silhouette coefficient values')
plt.ylabel('Cluster label')
# The vertical line for average silhouette score of all the values
plt.axvline(x=silhouette_avg, color="red", linestyle="--", linewidth=2)
plt.yticks([])  # Clear the yaxis labels / ticks
plt.xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])
plt.show()
# Perform Hierarchical clustering
# n_clusters=3 per the agglomerative silhouette search above.
agg_clustering = AgglomerativeClustering(n_clusters=3) # Specify the number of clusters
cluster_labels = agg_clustering.fit_predict(X)
# Add the cluster labels to your DataFrame
df['cluster_labels'] = cluster_labels
# Visualize the clusters
plt.figure(figsize=(10, 6))
# Plot the data points with different colors for each cluster
for cluster_label in df['cluster_labels'].unique():
cluster_data = df[df['cluster_labels'] == cluster_label]
plt.scatter(
cluster_data['Adjusted'],
cluster_data['Volume'],
label=f'Cluster {cluster_label}',
)
plt.xlabel('Adjusted')
plt.ylabel('Volume')
plt.title('Hierarchical Clustering')
plt.legend()
plt.show()
# create linkage for agglomerative clustering, and the dendrogram for the linkage. Suggest the optimal number of clusters based on the dendrogram.
# Build the Ward linkage for agglomerative clustering and draw the dendrogram;
# the dashed horizontal line marks a candidate cut height for choosing the
# number of clusters.
from scipy.cluster.hierarchy import dendrogram, linkage

ward_links = linkage(X, method='ward')  # Ward minimizes within-cluster variance
plt.figure(figsize=(10, 6))
dendrogram(ward_links, truncate_mode='level')
plt.axhline(y=20, color='r', linestyle='--')
plt.show()
# Calculate silhouette scores for the 3-cluster hierarchical solution
silhouette_vals = silhouette_samples(X, cluster_labels)
silhouette_avg = silhouette_score(X, cluster_labels)
# Create Silhouette plot
plt.figure(figsize=(10, 6))
y_lower = 10  # vertical offset where the first cluster's band starts
for i in range(3):
    # Sorted silhouette values of the samples assigned to cluster i.
    ith_cluster_silhouette_vals = silhouette_vals[cluster_labels == i]
    ith_cluster_silhouette_vals.sort()
    size_cluster_i = ith_cluster_silhouette_vals.shape[0]
    y_upper = y_lower + size_cluster_i
    color = plt.rcParams['axes.prop_cycle'].by_key()['color'][i]
    plt.fill_betweenx(np.arange(y_lower, y_upper), 0, ith_cluster_silhouette_vals, facecolor=color, edgecolor=color, alpha=0.7)
    # Label the silhouette plots with their cluster numbers at the middle
    plt.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
    # Compute the new y_lower for the next plot
    y_lower = y_upper + 10  # 10 for the 0 samples
plt.title('Silhouette Plot for Hierarchical Clustering')
plt.xlabel('Silhouette coefficient values')
plt.ylabel('Cluster label')
# The vertical line for average silhouette score of all the values
plt.axvline(x=silhouette_avg, color="red", linestyle="--", linewidth=2)
plt.yticks([])  # Clear the yaxis labels / ticks
plt.xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])
plt.show()
---
format:
  html:
    embed-resources: true
    code-fold: true
---
import pandas as pd
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
# read data
# Load the pre-processed stock dataset into df. Columns dn/mavg/up are
# presumably a moving-average band around the price — confirm upstream.
df = pd.read_csv("../data/modified-data/stock_cluster.csv")
df.head()  # notebook display of the first five rows
|   | Unnamed: 0 | Date | Open | High | Low | Close | Volume | Adjusted | dn | mavg | up | Stock | direction |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2022-01-03 | 2022-01-03 | 605.609985 | 609.989990 | 590.559998 | 597.369995 | 3067500 | 597.369995 | 587.291452 | 607.250833 | 627.210215 | NFLX | Decreasing |
| 1 | 2022-01-04 | 2022-01-04 | 599.909973 | 600.409973 | 581.599976 | 591.150024 | 4393100 | 591.150024 | 585.186408 | 606.287166 | 627.387925 | NFLX | Decreasing |
| 2 | 2022-01-05 | 2022-01-05 | 592.000000 | 592.840027 | 566.880005 | 567.520020 | 4148700 | 567.520020 | 580.284351 | 603.976666 | 627.668982 | NFLX | Decreasing |
| 3 | 2022-01-06 | 2022-01-06 | 554.340027 | 563.359985 | 542.010010 | 553.289978 | 5711800 | 553.289978 | 570.018038 | 600.225332 | 630.432625 | NFLX | Decreasing |
| 4 | 2022-01-07 | 2022-01-07 | 549.460022 | 553.429993 | 538.219971 | 541.059998 | 3382900 | 541.059998 | 558.782412 | 596.575831 | 634.369250 | NFLX | Decreasing |
# define X
# Feature matrix: the nine numeric columns (date/ticker/direction excluded).
X = df[['Open', 'High','Low', 'Close', 'Volume', 'Adjusted', 'dn', 'mavg', 'up']]
print(X.head())
Open High Low Close Volume Adjusted \
0 605.609985 609.989990 590.559998 597.369995 3067500 597.369995
1 599.909973 600.409973 581.599976 591.150024 4393100 591.150024
2 592.000000 592.840027 566.880005 567.520020 4148700 567.520020
3 554.340027 563.359985 542.010010 553.289978 5711800 553.289978
4 549.460022 553.429993 538.219971 541.059998 3382900 541.059998
dn mavg up
0 587.291452 607.250833 627.210215
1 585.186408 606.287166 627.387925
2 580.284351 603.976666 627.668982
3 570.018038 600.225332 630.432625
4 558.782412 596.575831 634.369250
# pair plot
# Pairwise scatter/histogram matrix of all nine features, followed by a
# correlation heatmap — both computed on the raw (unscaled) feature frame.
sns.pairplot(X)
plt.show()
plt.figure(figsize=(10, 6))
sns.heatmap(X.corr(), annot=True)
plt.title('Correlation Heatmap')
plt.show()
# Scree check: fit a full PCA (all components) and plot the cumulative
# explained-variance ratio to pick a dimensionality.
# NOTE(review): this fit uses the raw, unscaled feature matrix; the
# StandardScaler is only applied in the next cell — confirm that is intended.
pca = PCA()
pca.fit(X)
plt.figure(figsize=(10, 6))
plt.plot(range(1, len(pca.explained_variance_ratio_) + 1),
         pca.explained_variance_ratio_.cumsum(), marker='o')
plt.title('Explained Variance')
plt.xlabel('Number of Principal Components')
plt.ylabel('Cumulative Explained Variance')
plt.show()
# PCA
# Standardize the features (zero mean / unit variance) so no single column
# dominates the principal components, then project onto 2 dimensions.
X = StandardScaler().fit_transform(X)
pca = PCA(n_components=2)  # keep the first 2 principal components
# fit_transform fits the model and returns the projection in one pass; the
# original called pca.fit(X) and then pca.fit_transform(X), fitting the same
# model twice for no benefit.
newX = pca.fit_transform(X)
print(pca.explained_variance_ratio_)
print(newX)
[0.89109103 0.09554449] [[10.85433149 0.68579939] [10.71167546 0.70664221] [10.41984188 0.66110329] ... [-1.50904694 -0.95411545] [-1.48990881 -1.00794026] [-1.47147694 -1.05500015]]
# Create a scatter plot of the first two attributes of the original data
plt.figure(figsize=(10, 6))
# Perform PCA
pca = PCA(n_components=2)
newX = pca.fit_transform(X)
plt.scatter(newX[:, 0], newX[:, 1], alpha=0.8)
plt.title('PCA Visualization')
# Plotting the principal components
# components_.T yields one (PC1-loading, PC2-loading) pair per original
# feature, drawn as an arrow from the origin with a generic attribute label.
for i, (x, y) in enumerate(pca.components_.T):
    plt.arrow(0, 0, x, y, color='r', alpha=0.8)
    plt.text(x, y, f'Attribute_{i+1}', color='g', fontsize=8)
plt.show()
# Side-by-side view: the first two standardized features vs. the 2-D PCA
# projection of the same data.
# Create a scatter plot of the original data
plt.figure(figsize=(15, 5))
plt.subplot(1, 2, 1)
plt.scatter(X[:, 0], X[:, 1], alpha=0.8, label='Original Data')
plt.title('Original Data')
# Perform PCA
pca = PCA(n_components=2)
newX = pca.fit_transform(X)
# Create a scatter plot of the transformed data after PCA
plt.subplot(1, 2, 2)
plt.scatter(newX[:, 0], newX[:, 1], alpha=0.8, label='Transformed Data')
plt.title('Data After PCA')
# Plotting the principal components (if needed)
# The two arrows are the loadings of the two principal-component directions.
plt.quiver(0, 0, pca.components_[0, 0], pca.components_[0, 1], angles='xy', scale_units='xy', scale=0.5, color='r')
plt.quiver(0, 0, pca.components_[1, 0], pca.components_[1, 1], angles='xy', scale_units='xy', scale=0.5, color='b')
plt.legend()
plt.tight_layout()
plt.show()
# Scatter two of the original attributes next to the PCA projection.
# X[:, 0] and X[:, 4] are plotted, i.e. attributes 1 and 5 — the original
# legend said "Attributes 1 and 2", which did not match the data actually
# plotted; the label below is corrected.
plt.figure(figsize=(15, 5))
plt.subplot(1, 2, 1)
plt.scatter(X[:, 0], X[:, 4], alpha=0.8, label='Original Data (Attributes 1 and 5)')
plt.title('Original Data')
# Perform PCA
pca = PCA(n_components=2)
newX = pca.fit_transform(X)
# Create a scatter plot of the transformed data after PCA
plt.subplot(1, 2, 2)
plt.scatter(newX[:, 0], newX[:, 1], alpha=0.8, label='Transformed Data')
plt.title('Data After PCA')
# Plotting the principal components (if needed)
for i in range(pca.components_.shape[0]):
    plt.quiver(0, 0, pca.components_[i, 0], pca.components_[i, 1],
               angles='xy', scale_units='xy', scale=0.5, color='r')
plt.legend()
plt.tight_layout()
plt.show()
# Fit t-SNE for 1..3 output components and record each fit's final KL
# divergence (the barnes_hut method only supports fewer than 4 components,
# hence the min(..., 4) cap).
kl_divergence = []
# Iterate over a range of components to fit t-SNE models
for n_components in range(1, min(X.shape[1], 4)):
    tsne = TSNE(n_components=n_components, random_state=42, method='barnes_hut')
    X_tsne = tsne.fit_transform(X)
    kl_divergence.append(tsne.kl_divergence_)
# Plot KL Divergence for each number of components
plt.figure(figsize=(10, 5))
plt.plot(range(1, len(kl_divergence) + 1), kl_divergence, marker='o')
plt.title('KL Divergence by Number of Components in t-SNE (barnes_hut)')
plt.xlabel('Number of Components')
plt.ylabel('KL Divergence')
plt.show()
# Embed the feature matrix in two dimensions with t-SNE and show the
# resulting embedding as a scatter plot.
tsne_model = TSNE(n_components=2)
embedding = tsne_model.fit_transform(X)
plt.figure(figsize=(10, 6))
plt.scatter(embedding[:, 0], embedding[:, 1], alpha=0.8)
plt.title('t-SNE Visualization')
plt.xlabel('t-SNE Component 1')
plt.ylabel('t-SNE Component 2')
plt.show()
# 3-D t-SNE embedding of the feature matrix, rendered in a 3-D scatter plot.
from mpl_toolkits.mplot3d import Axes3D  # registers the '3d' projection
tsne_3d = TSNE(n_components=3)
X_tsne_3d = tsne_3d.fit_transform(X)
fig = plt.figure(figsize=(20, 14))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(X_tsne_3d[:, 0], X_tsne_3d[:, 1], X_tsne_3d[:, 2], alpha=0.8)
ax.set_title('t-SNE 3D Visualization')
ax.set_xlabel('t-SNE Component 1')
ax.set_ylabel('t-SNE Component 2')
ax.set_zlabel('t-SNE Component 3')
plt.show()
# Compare the 2-D PCA projection and the 2-D t-SNE embedding side by side.
# Perform PCA
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)
# Perform t-SNE
tsne = TSNE(n_components=2)
X_tsne = tsne.fit_transform(X)
# Create subplots
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
# Scatter plot for PCA
axes[0].scatter(X_pca[:, 0], X_pca[:, 1], alpha=0.8)
axes[0].set_title('PCA Visualization')
axes[0].set_xlabel('PCA Component 1')
axes[0].set_ylabel('PCA Component 2')
# Scatter plot for t-SNE
axes[1].scatter(X_tsne[:, 0], X_tsne[:, 1], alpha=0.8)
axes[1].set_title('t-SNE Visualization')
axes[1].set_xlabel('t-SNE Component 1')
axes[1].set_ylabel('t-SNE Component 2')
plt.tight_layout()
plt.show()
---
format:
  html:
    embed-resources: true
    code-fold: true
---
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import tree
from IPython.display import Image
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
The following code will import the data file into a pandas data-frame
# LOAD THE DATAFRAME
# Pull the sklearn breast-cancer dataset as pandas objects and join the
# features and target into a single frame.
from sklearn.datasets import load_breast_cancer
(x,y) = load_breast_cancer(return_X_y=True,as_frame=True)
df=pd.concat([x,y],axis=1)
# LOOK AT FIRST ROW
print(df.iloc[0])
mean radius 17.990000 mean texture 10.380000 mean perimeter 122.800000 mean area 1001.000000 mean smoothness 0.118400 mean compactness 0.277600 mean concavity 0.300100 mean concave points 0.147100 mean symmetry 0.241900 mean fractal dimension 0.078710 radius error 1.095000 texture error 0.905300 perimeter error 8.589000 area error 153.400000 smoothness error 0.006399 compactness error 0.049040 concavity error 0.053730 concave points error 0.015870 symmetry error 0.030030 fractal dimension error 0.006193 worst radius 25.380000 worst texture 17.330000 worst perimeter 184.600000 worst area 2019.000000 worst smoothness 0.162200 worst compactness 0.665600 worst concavity 0.711900 worst concave points 0.265400 worst symmetry 0.460100 worst fractal dimension 0.118900 target 0.000000 Name: 0, dtype: float64
# Replace df with the stock dataset, keeping the numeric price/volume/band
# columns plus the ticker symbol.
df = pd.read_csv("../data/modified-data/stock_cluster.csv")
df = df[['Open', 'High', 'Low', 'Close', 'Volume', 'Adjusted', 'dn', 'mavg', 'up', 'Stock']]
# # RUN THE FOLLOWING CODE TO GENERATE A SEABORN PAIRPLOT
# NOTE(review): `y` here is still the breast-cancer target from the previous
# cell, concatenated onto the stock frame — the "target" hue is therefore
# taken from a different dataset. Confirm this is intentional before trusting
# the pairplot.
tmp=pd.concat([df.sample(n=10,axis=1),y],axis=1)
print(tmp.shape)
sns.pairplot(tmp,hue="target", diag_kind='kde')
plt.show()
(1431, 11)
# Turn the 'Stock' ticker into integer category codes and rename the column
# to 'target' so it can serve as the classification label.
df['Stock'] = df['Stock'].astype('category').cat.codes
df.rename(columns={"Stock": 'target'}, inplace=True)
# Sanity check: report the resulting shape and column names.
print(df.shape)
print(df.columns)
(1431, 10)
Index(['Open', 'High', 'Low', 'Close', 'Volume', 'Adjusted', 'dn', 'mavg',
'up', 'target'],
dtype='object')
We will be using y="target" (output target) and all other remaining columns as our X (input feature) matrix.
Before doing analysis it is always good to "get inside" the data and see what we are working with
#INSERT CODE TO PRINT THE FOLLOWING DATA-FRAME WHICH SUMMARIZES EACH COLUMN
# Per-column overview: dtype alongside min/mean/max pulled from describe().
stats = df.describe().T[['min', 'mean', 'max']]
summary = pd.concat([df.dtypes, stats], axis=1)
summary.columns = ['dtypes', 'min', 'mean', 'max']
print(summary)
dtypes min mean max Open float64 1.018000e+02 2.090672e+02 6.056100e+02 High float64 1.045400e+02 2.120809e+02 6.099900e+02 Low float64 1.012800e+02 2.060704e+02 5.905600e+02 Close float64 1.019600e+02 2.091259e+02 5.973700e+02 Volume int64 1.404700e+06 3.198843e+07 1.826020e+08 Adjusted float64 9.900871e+01 2.076091e+02 5.973700e+02 dn float64 9.866728e+01 1.920277e+02 5.872915e+02 mavg float64 1.086962e+02 2.100733e+02 6.072508e+02 up float64 1.161748e+02 2.281188e+02 6.778163e+02 target int8 0.000000e+00 1.000000e+00 2.000000e+00
# INSERT CODE TO EXPLORE THE LOAD BALANCE AND COUNT THE NUMBER OF SAMPLES FOR EACH TARGET (THEN PRINT THE RESULT)
# Class balance: absolute and relative frequency of each target label.
for target, count in df['target'].value_counts().items():
    print(f"Number of points with target = {target}: {count} {count / len(df)}")
Number of points with target=2: 477 0.3333333333333333 Number of points with target=0: 477 0.3333333333333333 Number of points with target=1: 477 0.3333333333333333
# RUN THE FOLLOWING CODE TO SHOW THE HEAT-MAP FOR THE CORRELATION MATRIX
# Annotated, diverging-color heatmap of the 10x10 feature/target correlations.
corr = df.corr(); #print(corr) #COMPUTE CORRELATION OF FEATER MATRIX
print(corr.shape)
sns.set_theme(style="white")
f, ax = plt.subplots(figsize=(20, 20))  # Set up the matplotlib figure
cmap = sns.diverging_palette(230, 20, as_cmap=True)  # Generate a custom diverging colormap
# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, annot=True, cmap=cmap, vmin=-1, vmax=1, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})
plt.show();
(10, 10)
When the dataset is very large then the seaborn pairplot is often very slow.
However, in this case it can still be useful to look at a subset of the features
# INSERT CODE TO MAKE DATA-FRAMES (or numpy arrays) (X,Y) WHERE Y="target" COLUMN and X="everything else"
X = df.drop(columns=['target'])
Y = df['target']
#X_array = X.values
#Y_array = Y.values
# INSERT CODE TO PARTITION THE DATASET INTO TRAINING AND TEST SETS
from sklearn.model_selection import train_test_split
# NOTE(review): no random_state is set, so the split (and every downstream
# metric) changes on each run — consider a fixed seed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)
# INSERT CODE, AS A CONSISTENCY CHECK, TO PRINT THE TYPE AND SHAPE OF x_train, x_test, y_train, y_test
# Confirm the 80/20 split produced pandas objects of the expected shapes.
for split_part in (x_train, y_train, x_test, y_test):
    print(type(split_part), split_part.shape)
<class 'pandas.core.frame.DataFrame'> (1144, 9) <class 'pandas.core.series.Series'> (1144,) <class 'pandas.core.frame.DataFrame'> (287, 9) <class 'pandas.core.series.Series'> (287,)
## INSERT CODE BELOW TO TRAIN A SKLEARN DECISION TREE MODEL ON x_train,y_train
# Fit an unconstrained decision tree (no max_depth), so it is free to grow
# until every training leaf is pure.
model = tree.DecisionTreeClassifier()
model = model.fit(x_train, y_train)
Evaluate the performance of the decision tree model by using the test data.
# INSERT CODE TO USE THE MODEL TO MAKE PREDICTIONS FOR THE TRAINING AND TEST SET
# Predictions on both splits are kept so train/test metrics can be compared.
yp_train = model.predict(x_train)
yp_test = model.predict(x_test)
Use the following reference to display the confusion matrix. SKlearn Confusion Matrix will give you the code you need.
In the function below, also print the following as part of the function output
ACCURACY: 0.9035087719298246
NEGATIVE RECALL (Y=0): 0.9574468085106383
NEGATIVE PRECISION (Y=0): 0.8333333333333334
POSITIVE RECALL (Y=1): 0.8656716417910447
POSITIVE PRECISION (Y=1): 0.9666666666666667
[[45 2]
[ 9 58]]
#INSERT CODE TO WRITE A FUNCTION def confusion_plot(y_data,y_pred) WHICH GENERATES A CONFUSION MATRIX PLOT AND PRINTS THE INFORMATION ABOVE (see link above for example)
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

def confusion_plot(y,yp):
    """Print accuracy and binary precision/recall for (y, yp), then draw the
    confusion matrix.

    NOTE(review): this version assumes binary labels (pos_label=0/1) and is
    immediately shadowed by the multiclass redefinition in the next cell,
    which is the one actually used for the 3-class stock target.
    """
    cm = confusion_matrix(y, yp)
    accuracy = accuracy_score(y, yp)
    precision_0 = precision_score(y, yp, pos_label=0)
    precision_1 = precision_score(y, yp, pos_label=1)
    recall_0 = recall_score(y, yp, pos_label=0)
    recall_1 = recall_score(y, yp, pos_label=1)
    print("ACCURACY:", accuracy)
    print("NEGATIVE RECALL (Y=0):", recall_0)
    print("NEGATIVE PRECISION (Y=0):", precision_0)
    print("POSITIVE RECALL (Y=1):", recall_1)
    print("POSITIVE PRECISION (Y=1):", precision_1)
    print(cm)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm)
    disp.plot()
    plt.show()
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score, precision_score, recall_score
import matplotlib.pyplot as plt

def confusion_plot(y, yp):
    """Print accuracy plus per-class precision/recall for a 3-class problem,
    then draw the confusion matrix.

    Parameters
    ----------
    y : array-like of true labels (classes 0, 1, 2)
    yp : array-like of predicted labels
    """
    cm = confusion_matrix(y, yp)
    accuracy = accuracy_score(y, yp)
    # Compute the per-class metric arrays once instead of re-running
    # precision_score/recall_score three times each, as the original did.
    precision = precision_score(y, yp, average=None)
    recall = recall_score(y, yp, average=None)
    print("ACCURACY:", accuracy)
    print("CLASS 0 RECALL:", recall[0])
    print("CLASS 0 PRECISION:", precision[0])
    print("CLASS 1 RECALL:", recall[1])
    print("CLASS 1 PRECISION:", precision[1])
    print("CLASS 2 RECALL:", recall[2])
    print("CLASS 2 PRECISION:", precision[2])
    print(cm)
    # Draw into an explicitly created axes. The original called plt.figure()
    # and then disp.plot(), which makes its own figure and leaves the first
    # one empty (the stray "<Figure size 576x576 with 0 Axes>" in the output).
    fig, ax = plt.subplots(figsize=(8, 8))
    ConfusionMatrixDisplay(confusion_matrix=cm).plot(ax=ax)
    plt.show()

# Example usage:
# confusion_plot(y_true, y_pred)
# RUN THE FOLLOWING CODE TO TEST YOUR FUNCTION
# Evaluate on both splits; a perfect 1.0 training accuracy with the
# unconstrained tree (see the output below) is the expected memorization.
print("------TRAINING------")
confusion_plot(y_train,yp_train)
print("------TEST------")
confusion_plot(y_test,yp_test)
------TRAINING------ ACCURACY: 1.0 CLASS 0 RECALL: 1.0 CLASS 0 PRECISION: 1.0 CLASS 1 RECALL: 1.0 CLASS 1 PRECISION: 1.0 CLASS 2 RECALL: 1.0 CLASS 2 PRECISION: 1.0 [[383 0 0] [ 0 377 0] [ 0 0 384]]
<Figure size 576x576 with 0 Axes>
------TEST------ ACCURACY: 0.9930313588850174 CLASS 0 RECALL: 0.9893617021276596 CLASS 0 PRECISION: 0.9893617021276596 CLASS 1 RECALL: 0.99 CLASS 1 PRECISION: 1.0 CLASS 2 RECALL: 1.0 CLASS 2 PRECISION: 0.9893617021276596 [[93 0 1] [ 1 99 0] [ 0 0 93]]
<Figure size 576x576 with 0 Axes>
# INSERT CODE TO WRITE A FUNCTION "def plot_tree(model,X,Y)" VISUALIZE THE DECISION TREE (see https://mljar.com/blog/visualize-decision-tree/ for an example)
def plot_tree(model, X, Y):
    """Render the fitted decision tree with feature names taken from X's
    columns and class names from the model's fitted classes.

    Y is accepted to match the requested signature but is not used.
    """
    plt.figure(figsize=(20, 10))
    tree.plot_tree(model, feature_names=X.columns, class_names=[str(class_label) for class_label in model.classes_], filled=True)
    plt.show()

plot_tree(model, X, Y)
The "max_depth" hyper-parameter lets us control the number of layers in our tree.
Lets iterate over "max_depth" and try to find the set of hyper-parameters with the lowest training AND test error.
import matplotlib.pyplot as plt
import numpy as np
from sklearn import tree
from sklearn.metrics import accuracy_score, precision_score, recall_score

# COMPLETE THE FOLLOWING CODE TO LOOP OVER POSSIBLE HYPER-PARAMETERS VALUES
# Sweep max_depth from 1 to 19. For each depth store
#   [max_depth, accuracy, per-class precision array, per-class recall array]
# for both splits — note the storage order: index 2 is PRECISION, index 3 is
# RECALL.
test_results = []
train_results = []
for num_layer in range(1, 20):
    model = tree.DecisionTreeClassifier(max_depth=num_layer)
    model = model.fit(x_train, y_train)
    yp_train = model.predict(x_train)
    yp_test = model.predict(x_test)
    test_results.append([num_layer, accuracy_score(y_test, yp_test), precision_score(y_test, yp_test, average=None),
                         recall_score(y_test, yp_test, average=None)])
    train_results.append([num_layer, accuracy_score(y_train, yp_train), precision_score(y_train, yp_train, average=None),
                          recall_score(y_train, yp_train, average=None)])
# Extract results for plotting.
# Each entry of {test,train}_results is [max_depth, accuracy, precision_array,
# recall_array] (see the sweep loop above): index 2 is PRECISION and index 3
# is RECALL. The original extraction had the two indices swapped, which
# mislabeled every recall/precision curve plotted below.
num_layers = [result[0] for result in test_results]
# Accuracy
test_accuracies = [result[1] for result in test_results]
train_accuracies = [result[1] for result in train_results]
# Recall (stored at index 3)
test_recalls_class_0 = [result[3][0] for result in test_results]
train_recalls_class_0 = [result[3][0] for result in train_results]
test_recalls_class_1 = [result[3][1] for result in test_results]
train_recalls_class_1 = [result[3][1] for result in train_results]
test_recalls_class_2 = [result[3][2] for result in test_results]
train_recalls_class_2 = [result[3][2] for result in train_results]
# Precision (stored at index 2)
test_precisions_class_0 = [result[2][0] for result in test_results]
train_precisions_class_0 = [result[2][0] for result in train_results]
test_precisions_class_1 = [result[2][1] for result in test_results]
train_precisions_class_1 = [result[2][1] for result in train_results]
test_precisions_class_2 = [result[2][2] for result in test_results]
train_precisions_class_2 = [result[2][2] for result in train_results]
# Plot train vs. test curves for each metric across tree depths. Every figure
# has exactly the same shape, so drive all seven from one helper over
# (test_series, train_series, metric_name); the rendered figures are
# identical to writing each plt block out by hand.
def _plot_metric(test_series, train_series, metric_name):
    # One figure: test and train curves for a single metric vs. max_depth.
    plt.figure(figsize=(10,6))
    plt.plot(num_layers, test_series, 'o-', label=f'Test {metric_name}')
    plt.plot(num_layers, train_series, 'o-', label=f'Train {metric_name}')
    plt.title(f'{metric_name} vs. Number of Layers')
    plt.xlabel('Number of Layers')
    plt.ylabel(metric_name)
    plt.legend()
    plt.show()

_plot_metric(test_accuracies, train_accuracies, 'Accuracy')
_plot_metric(test_recalls_class_0, train_recalls_class_0, 'Recall Class 0')
_plot_metric(test_recalls_class_1, train_recalls_class_1, 'Recall Class 1')
_plot_metric(test_recalls_class_2, train_recalls_class_2, 'Recall Class 2')
_plot_metric(test_precisions_class_0, train_precisions_class_0, 'Precision Class 0')
_plot_metric(test_precisions_class_1, train_precisions_class_1, 'Precision Class 1')
_plot_metric(test_precisions_class_2, train_precisions_class_2, 'Precision Class 2')
/Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1471: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1471: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
Re-train the decision tree using the optimal hyper-parameter obtained from the plot above
#### COMPLETE THE CODE BELOW TO TRAIN A SKLEARN DECISION TREE MODEL ON x_train,y_train
# Retrain with the depth chosen from the sweep above (max_depth=4).
from sklearn import tree
model = tree.DecisionTreeClassifier(max_depth=4)
model = model.fit(x_train, y_train)
yp_train=model.predict(x_train)
yp_test=model.predict(x_test)
# RUN THE FOLLOWING CODE TO EVALUATE YOUR MODEL
print("------TRAINING------")
confusion_plot(y_train,yp_train)
print("------TEST------")
confusion_plot(y_test,yp_test)
plot_tree(model,X,Y)
------TRAINING------ ACCURACY: 1.0 CLASS 0 RECALL: 1.0 CLASS 0 PRECISION: 1.0 CLASS 1 RECALL: 1.0 CLASS 1 PRECISION: 1.0 CLASS 2 RECALL: 1.0 CLASS 2 PRECISION: 1.0 [[383 0 0] [ 0 377 0] [ 0 0 384]]
<Figure size 576x576 with 0 Axes>
------TEST------ ACCURACY: 0.9930313588850174 CLASS 0 RECALL: 0.9893617021276596 CLASS 0 PRECISION: 0.9893617021276596 CLASS 1 RECALL: 0.99 CLASS 1 PRECISION: 1.0 CLASS 2 RECALL: 1.0 CLASS 2 PRECISION: 0.9893617021276596 [[93 0 1] [ 1 99 0] [ 0 0 93]]
<Figure size 576x576 with 0 Axes>
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Initialize the RandomForestClassifier with the hyper-parameters reported as
# best by the randomized search below.
# NOTE(review): the search's best params also include bootstrap=False, which
# is not set here (sklearn defaults to bootstrap=True) — confirm whether that
# omission is intentional.
rf_classifier = RandomForestClassifier(n_estimators=200, min_samples_split=4, min_samples_leaf=3, max_features='log2', max_depth=70)
# Train the model
rf_classifier.fit(x_train, y_train)
# Make predictions on the test set
y_pred = rf_classifier.predict(x_test)
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)
# Print the evaluation metrics
print(f"Accuracy: {accuracy}")
print(f"Confusion Matrix:\n{conf_matrix}")
print(f"Classification Report:\n{class_report}")
Accuracy: 0.9930313588850174
Confusion Matrix:
[[93 0 1]
[ 1 99 0]
[ 0 0 93]]
Classification Report:
precision recall f1-score support
0 0.99 0.99 0.99 94
1 1.00 0.99 0.99 100
2 0.99 1.00 0.99 93
accuracy 0.99 287
macro avg 0.99 0.99 0.99 287
weighted avg 0.99 0.99 0.99 287
# Import necessary libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Plot confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

# Plot feature importances
feature_importances = rf_classifier.feature_importances_
feature_names = X.columns
plt.figure(figsize=(10, 6))
# Assign `hue` and disable the legend: passing `palette` without `hue` is
# deprecated in seaborn (it raised the FutureWarning seen in the original
# run) and will be removed in v0.14. The rendered bars are unchanged.
sns.barplot(x=feature_importances, y=feature_names, hue=feature_names, palette='viridis', legend=False)
plt.title('Feature Importances')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.show()
<ipython-input-143-6fd300f0b582>:18: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect. sns.barplot(x=feature_importances, y=feature_names, palette='viridis')
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Define the parameter distributions.
# 'auto' has been removed from max_features: modern sklearn only accepts an
# int, a float, 'sqrt', 'log2' or None, and sampling 'auto' made 57 of the
# 300 CV fits fail with InvalidParameterError in the original run.
param_dist = {
    'n_estimators': np.arange(50, 501, 50),
    'max_depth': [None] + list(np.arange(10, 101, 10)),
    'min_samples_split': np.arange(2, 11),
    'min_samples_leaf': np.arange(1, 11),
    'max_features': ['sqrt', 'log2', None],
    'bootstrap': [True, False]
}
# Create the base model
rf_base = RandomForestClassifier(random_state=42)
# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(estimator=rf_base, param_distributions=param_dist,
                                   n_iter=100, cv=3, scoring='accuracy', n_jobs=-1, random_state=42)
# Fit the model to the data
random_search.fit(x_train, y_train)
# Print the best parameters
print("Best Parameters:", random_search.best_params_)
# Get the best model
best_rf_model = random_search.best_estimator_
# Evaluate the model on the test set
y_pred = best_rf_model.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", accuracy)
# Visualize the effect of two hyperparameters
param1 = 'n_estimators'
param2 = 'max_depth'
# Extract the results from the RandomizedSearchCV
results = random_search.cv_results_
# astype(float) maps the None entries of max_depth to NaN so the values can
# be placed on a numeric axis.
param1_values = results['param_' + param1].data.astype(float)
param2_values = results['param_' + param2].data.astype(float)
mean_test_scores = results['mean_test_score']
# Create a scatter plot colored by mean CV accuracy
plt.figure(figsize=(12, 8))
sc = plt.scatter(param1_values, param2_values, c=mean_test_scores, cmap='viridis', marker='o', edgecolors='k')
plt.colorbar(sc, label='Mean Test Score')
plt.title(f'RandomizedSearchCV Results: {param1} vs {param2}')
plt.xlabel(param1)
plt.ylabel(param2)
plt.show()
/Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py:425: FitFailedWarning:
57 fits failed out of a total of 300.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.
Below are more details about the failures:
--------------------------------------------------------------------------------
43 fits failed with the following error:
Traceback (most recent call last):
File "/Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)
File "/Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/base.py", line 1145, in wrapper
estimator._validate_params()
File "/Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/base.py", line 638, in _validate_params
validate_parameter_constraints(
File "/Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/_param_validation.py", line 96, in validate_parameter_constraints
raise InvalidParameterError(
sklearn.utils._param_validation.InvalidParameterError: The 'max_features' parameter of RandomForestClassifier must be an int in the range [1, inf), a float in the range (0.0, 1.0], a str among {'sqrt', 'log2'} or None. Got 'auto' instead.
--------------------------------------------------------------------------------
14 fits failed with the following error:
Traceback (most recent call last):
File "/Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)
File "/Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/base.py", line 1145, in wrapper
estimator._validate_params()
File "/Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/base.py", line 638, in _validate_params
validate_parameter_constraints(
File "/Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/_param_validation.py", line 96, in validate_parameter_constraints
raise InvalidParameterError(
sklearn.utils._param_validation.InvalidParameterError: The 'max_features' parameter of RandomForestClassifier must be an int in the range [1, inf), a float in the range (0.0, 1.0], a str among {'log2', 'sqrt'} or None. Got 'auto' instead.
warnings.warn(some_fits_failed_message, FitFailedWarning)
/Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_search.py:979: UserWarning: One or more of the test scores are non-finite: [0.99475524 nan 0.99650273 nan 0.99388035 0.99562784
0.99562555 0.99388035 0.99475524 0.99388035 0.99388035 nan
nan 0.99650273 0.99388035 nan 0.98775611 0.99475295
0.98775611 0.98950589 0.98775611 nan nan 0.99388035
0.99388035 0.99562555 0.99125567 0.99388035 0.99475295 0.99737533
0.99388035 nan nan nan 0.99650273 0.99213056
0.99475524 0.99562555 0.98775611 0.99562555 0.99475524 0.98775611
0.99650044 0.99388035 0.99388035 0.99038078 0.99388035 0.99562555
0.99388035 0.99388035 0.99388035 0.98775611 0.99388035 0.99650273
0.99475524 0.99388035 nan 0.99388035 nan 0.99562784
nan nan nan 0.99562784 0.99388035 0.99388035
0.99388035 0.99650273 0.99562784 0.99475524 0.99388035 0.99388035
0.99475524 0.99562784 nan nan 0.99650273 0.99650273
0.99388035 0.99388035 0.99475295 0.99388035 0.99563013 0.99388035
0.98950589 nan 0.99213056 nan 0.99562784 0.99475295
0.99038078 0.99388035 0.98775611 0.99562555 0.99562784 0.99650273
0.99388035 0.99212827 0.99562555 0.99388035]
warnings.warn(
Best Parameters: {'n_estimators': 200, 'min_samples_split': 4, 'min_samples_leaf': 3, 'max_features': 'log2', 'max_depth': 70, 'bootstrap': False}
Test Accuracy: 0.9930313588850174
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Define the parameter grid.
# 'auto' has been removed from max_features: it is rejected by modern sklearn
# with InvalidParameterError and caused the FitFailedWarning emitted by the
# original run of this cell.
param_grid = {
    'n_estimators': np.arange(50, 501, 50),
    'max_depth': [None] + list(np.arange(10, 101, 10)),
    'min_samples_split': np.arange(2, 11),
    'min_samples_leaf': np.arange(1, 11),
    'max_features': ['sqrt', 'log2', None]
}
# Create a Random Forest classifier
rf_classifier = RandomForestClassifier(random_state=42)
# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(
    rf_classifier,
    param_distributions=param_grid,
    n_iter=10,  # You can adjust the number of iterations
    scoring='accuracy',
    cv=5,  # You can adjust the number of cross-validation folds
    random_state=42,
    n_jobs=-1  # Use -1 to use all available CPU cores
)
# Fit the RandomizedSearchCV
random_search.fit(x_train, y_train)
# Get the best hyperparameters
best_params = random_search.best_params_
# Initialize the base model with the best hyperparameters
best_rf_model = RandomForestClassifier(**best_params, random_state=42)
# Lists to store results
train_accuracies = []
test_accuracies = []
# Plot n_estimators vs. accuracy: refit the best model for each candidate
# n_estimators value and record train/test accuracy.
plt.figure(figsize=(10, 6))
for n_estimators in param_grid['n_estimators']:
    best_rf_model.n_estimators = n_estimators
    best_rf_model.fit(x_train, y_train)
    train_pred = best_rf_model.predict(x_train)
    test_pred = best_rf_model.predict(x_test)
    train_accuracies.append(accuracy_score(y_train, train_pred))
    test_accuracies.append(accuracy_score(y_test, test_pred))
# Plot points with dots and connect with lines
plt.plot(param_grid['n_estimators'], train_accuracies, 'o-', label='Train Accuracy')
plt.plot(param_grid['n_estimators'], test_accuracies, 'o-', label='Test Accuracy')
plt.title('Effect of n_estimators on Accuracy')
plt.xlabel('n_estimators')
plt.ylabel('Accuracy')
plt.legend()
plt.show()
# Similar code can be used for other hyperparameters (max_depth, min_samples_split, min_samples_leaf, max_features)
/Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py:425: FitFailedWarning:
15 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.
Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
File "/Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)
File "/Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/base.py", line 1145, in wrapper
estimator._validate_params()
File "/Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/base.py", line 638, in _validate_params
validate_parameter_constraints(
File "/Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/_param_validation.py", line 96, in validate_parameter_constraints
raise InvalidParameterError(
sklearn.utils._param_validation.InvalidParameterError: The 'max_features' parameter of RandomForestClassifier must be an int in the range [1, inf), a float in the range (0.0, 1.0], a str among {'sqrt', 'log2'} or None. Got 'auto' instead.
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
File "/Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)
File "/Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/base.py", line 1145, in wrapper
estimator._validate_params()
File "/Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/base.py", line 638, in _validate_params
validate_parameter_constraints(
File "/Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/_param_validation.py", line 96, in validate_parameter_constraints
raise InvalidParameterError(
sklearn.utils._param_validation.InvalidParameterError: The 'max_features' parameter of RandomForestClassifier must be an int in the range [1, inf), a float in the range (0.0, 1.0], a str among {'log2', 'sqrt'} or None. Got 'auto' instead.
warnings.warn(some_fits_failed_message, FitFailedWarning)
/Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_search.py:979: UserWarning: One or more of the test scores are non-finite: [0.99562936 nan 0.99562936 nan 0.99388263 0.99562936
0.99562936 nan 0.99562936 0.99388263]
warnings.warn(
# Sweep max_depth around the tuned configuration and plot train/test accuracy.
train_accuracies = []
test_accuracies = []
plt.figure(figsize=(10, 6))
for max_depth in param_grid['max_depth']:
    # FIX: rebuild the model from best_params each iteration instead of
    # mutating the shared best_rf_model. The original carried over the
    # previous sweep's LAST tested n_estimators (500) — not the tuned
    # optimum — and left max_depth contaminated for subsequent sweeps.
    sweep_model = RandomForestClassifier(**best_params, random_state=42)
    sweep_model.max_depth = max_depth
    sweep_model.fit(x_train, y_train)
    train_pred = sweep_model.predict(x_train)
    test_pred = sweep_model.predict(x_test)
    train_accuracies.append(accuracy_score(y_train, train_pred))
    test_accuracies.append(accuracy_score(y_test, test_pred))
# Plot points with dots and connect with lines
plt.plot(param_grid['max_depth'], train_accuracies, 'o-', label='Train Accuracy')
plt.plot(param_grid['max_depth'], test_accuracies, 'o-', label='Test Accuracy')
plt.title('Effect of max_depth on Accuracy')
plt.xlabel('max_depth')
plt.ylabel('Accuracy')
plt.legend()
plt.show()
# Sweep min_samples_split around the tuned configuration and plot accuracy.
train_accuracies = []
test_accuracies = []
plt.figure(figsize=(10, 6))
for min_samples_split in param_grid['min_samples_split']:
    # FIX: fresh model per value. Mutating the shared best_rf_model made
    # this sweep inherit whatever max_depth/n_estimators the previous sweep
    # last set, so only min_samples_split should vary from the tuned optimum.
    sweep_model = RandomForestClassifier(**best_params, random_state=42)
    sweep_model.min_samples_split = min_samples_split
    sweep_model.fit(x_train, y_train)
    train_pred = sweep_model.predict(x_train)
    test_pred = sweep_model.predict(x_test)
    train_accuracies.append(accuracy_score(y_train, train_pred))
    test_accuracies.append(accuracy_score(y_test, test_pred))
# Plot points with dots and connect with lines
plt.plot(param_grid['min_samples_split'], train_accuracies, 'o-', label='Train Accuracy')
plt.plot(param_grid['min_samples_split'], test_accuracies, 'o-', label='Test Accuracy')
plt.title('Effect of min_samples_split on Accuracy')
plt.xlabel('min_samples_split')
plt.ylabel('Accuracy')
plt.legend()
plt.show()
# Sweep min_samples_leaf around the tuned configuration and plot accuracy.
train_accuracies = []
test_accuracies = []
plt.figure(figsize=(10, 6))
for min_samples_leaf in param_grid['min_samples_leaf']:
    # FIX: fresh model per value. The original mutated the shared
    # best_rf_model, so this sweep ran with every previously-swept
    # hyperparameter stuck at its last tested value instead of the optimum.
    sweep_model = RandomForestClassifier(**best_params, random_state=42)
    sweep_model.min_samples_leaf = min_samples_leaf
    sweep_model.fit(x_train, y_train)
    train_pred = sweep_model.predict(x_train)
    test_pred = sweep_model.predict(x_test)
    train_accuracies.append(accuracy_score(y_train, train_pred))
    test_accuracies.append(accuracy_score(y_test, test_pred))
# Plot points with dots and connect with lines
plt.plot(param_grid['min_samples_leaf'], train_accuracies, 'o-', label='Train Accuracy')
plt.plot(param_grid['min_samples_leaf'], test_accuracies, 'o-', label='Test Accuracy')
plt.title('Effect of min_samples_leaf on Accuracy')
plt.xlabel('min_samples_leaf')
plt.ylabel('Accuracy')
plt.legend()
plt.show()